diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index aa5ea77f17291..ffbb111d42221 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, PreloadedScratchRsrcReg, ScratchRsrcReg, ScratchWaveOffsetReg); } + + if (ST.hasWaitXCnt()) { + // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK + // replay. This aligns hardware behavior with the compiler's s_wait_xcnt + // insertion logic, which assumes multi-group mode by default. + unsigned RegEncoding = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(1) + .addImm(RegEncoding); + } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index b5d593a9c15ed..58586129fb4e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -41,6 +41,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -71,6 +72,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -114,6 +116,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -160,6 +163,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -190,6 +194,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -233,6 +238,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -279,6 +285,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -309,6 +316,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -352,6 +360,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -397,6 +406,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -427,6 +437,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -470,6 +481,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -515,6 +527,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -545,6 +558,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -588,6 +602,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -634,6 +649,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -664,6 +680,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -707,6 +724,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -753,6 +771,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -783,6 +802,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -826,6 +846,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -871,6 +892,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -901,6 +923,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -944,6 +967,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -989,6 +1013,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1019,6 +1044,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1062,6 +1088,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1108,6 +1135,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1138,6 +1166,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1181,6 +1210,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1227,6 +1257,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1257,6 +1288,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1300,6 +1332,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1345,6 +1378,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1375,6 +1409,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1418,6 +1453,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1486,6 +1522,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -1558,6 +1595,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -1632,6 +1670,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -1704,6 +1743,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -1894,6 +1934,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -1949,6 +1990,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1991,6 +2033,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -2035,6 +2078,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -2197,6 +2241,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -2255,6 +2300,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -2320,6 +2366,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 @@ -2385,6 +2432,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-NEXT: s_mov_b32 s1, exec_lo ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 6facdfdec64ae..d16dc348209e2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -34,6 +34,7 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) { ; ; GFX1250-LABEL: abs_sgpr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_sext_i32_i16 s0, s0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_abs_i32 s0, s0 @@ -43,10 +44,26 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) { } define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) { -; GFX-LABEL: abs_sgpr_i32: -; GFX: ; %bb.0: -; GFX-NEXT: s_abs_i32 s0, s0 -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_abs_i32 s0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: ; return to shader part epilog %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } @@ -81,6 +98,7 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) { ; ; GFX1250-LABEL: abs_sgpr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_ashr_i32 s2, s1, 31 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mov_b32 s3, s2 @@ -93,13 +111,38 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) { } define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) { -; GFX-LABEL: abs_sgpr_v4i32: -; GFX: ; %bb.0: -; GFX-NEXT: s_abs_i32 s0, s0 -; GFX-NEXT: s_abs_i32 s1, s1 -; GFX-NEXT: s_abs_i32 s2, s2 -; GFX-NEXT: s_abs_i32 s3, s3 -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: s_abs_i32 s3, s3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: s_abs_i32 s1, s1 +; GFX8-NEXT: s_abs_i32 s2, s2 +; GFX8-NEXT: s_abs_i32 s3, s3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_abs_i32 s0, s0 +; GFX10-NEXT: s_abs_i32 s1, s1 +; GFX10-NEXT: s_abs_i32 s2, s2 +; GFX10-NEXT: s_abs_i32 s3, s3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: s_abs_i32 s1, s1 +; GFX1250-NEXT: s_abs_i32 s2, s2 +; GFX1250-NEXT: s_abs_i32 s3, s3 +; GFX1250-NEXT: ; return to shader part epilog %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -278,13 +321,38 @@ define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { } define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) { -; GFX-LABEL: abs_sgpr_v2i8: -; GFX: ; %bb.0: -; GFX-NEXT: s_sext_i32_i8 s0, s0 -; GFX-NEXT: s_sext_i32_i8 s1, s1 -; GFX-NEXT: s_abs_i32 s0, s0 -; GFX-NEXT: s_abs_i32 s1, s1 -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_v2i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_v2i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: s_abs_i32 s1, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_v2i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i8 s0, s0 +; GFX10-NEXT: s_sext_i32_i8 s1, s1 +; GFX10-NEXT: s_abs_i32 s0, s0 +; GFX10-NEXT: s_abs_i32 s1, s1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_v2i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_sext_i32_i8 s0, s0 +; GFX1250-NEXT: s_sext_i32_i8 s1, s1 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: s_abs_i32 s1, s1 +; GFX1250-NEXT: ; return to shader part epilog %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -340,15 +408,46 @@ define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { } define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) { -; GFX-LABEL: abs_sgpr_v3i8: -; GFX: ; %bb.0: -; GFX-NEXT: s_sext_i32_i8 s0, s0 -; GFX-NEXT: s_sext_i32_i8 s1, s1 -; GFX-NEXT: s_sext_i32_i8 s2, s2 -; GFX-NEXT: s_abs_i32 s0, s0 -; GFX-NEXT: s_abs_i32 s1, s1 -; GFX-NEXT: s_abs_i32 s2, s2 -; GFX-NEXT: ; return to shader part epilog +; GFX6-LABEL: abs_sgpr_v3i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 +; GFX6-NEXT: s_sext_i32_i8 s2, s2 +; GFX6-NEXT: s_abs_i32 s0, s0 +; GFX6-NEXT: s_abs_i32 s1, s1 +; GFX6-NEXT: s_abs_i32 s2, s2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: abs_sgpr_v3i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NEXT: s_abs_i32 s0, s0 +; GFX8-NEXT: s_abs_i32 s1, s1 +; GFX8-NEXT: s_abs_i32 s2, s2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: abs_sgpr_v3i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sext_i32_i8 s0, s0 +; GFX10-NEXT: s_sext_i32_i8 s1, s1 +; GFX10-NEXT: s_sext_i32_i8 s2, s2 +; GFX10-NEXT: s_abs_i32 s0, s0 +; GFX10-NEXT: s_abs_i32 s1, s1 +; GFX10-NEXT: s_abs_i32 s2, s2 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: abs_sgpr_v3i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_sext_i32_i8 s0, s0 +; GFX1250-NEXT: s_sext_i32_i8 s1, s1 +; GFX1250-NEXT: s_sext_i32_i8 s2, s2 +; GFX1250-NEXT: s_abs_i32 s0, s0 +; GFX1250-NEXT: s_abs_i32 s1, s1 +; GFX1250-NEXT: s_abs_i32 s2, s2 +; GFX1250-NEXT: ; return to shader part epilog %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -446,6 +545,7 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ; ; GFX1250-LABEL: abs_sgpr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_sext_i32_i16 s1, s0 ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16 ; GFX1250-NEXT: s_abs_i32 s1, s1 @@ -536,6 +636,7 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ; ; GFX1250-LABEL: abs_sgpr_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_sext_i32_i16 s2, s0 ; GFX1250-NEXT: s_ashr_i32 s0, s0, 16 ; GFX1250-NEXT: s_abs_i32 s2, s2 @@ -598,3 +699,5 @@ define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index efa51ead0d196..1de5e136c400d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -930,6 +930,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; ; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] ; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 @@ -940,6 +941,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3 @@ -1208,6 +1210,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; ; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX1250-UNALIGNED: ; %bb.0: +; GFX1250-UNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1] ; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0 @@ -1218,6 +1221,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX1250-NOUNALIGNED: ; %bb.0: +; GFX1250-NOUNALIGNED-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5 ; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2 ; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6 @@ -1362,6 +1366,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; ; GFX1250-LABEL: s_load_constant_v3i32_align4: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s4, s0 ; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 @@ -1413,6 +1418,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { ; ; GFX1250-LABEL: s_load_constant_i96_align8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s4, s0 ; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 @@ -1464,6 +1470,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg ; ; GFX1250-LABEL: s_load_constant_v3i32_align8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s4, s0 ; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 @@ -1515,6 +1522,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg ; ; GFX1250-LABEL: s_load_constant_v6i16_align8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s4, s0 ; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 @@ -1593,6 +1601,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg ; ; GFX1250-LABEL: s_load_constant_v12i8_align8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s4, s0 ; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 @@ -1670,11 +1679,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v3i32_align16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align16: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align16: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v3i32_align16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GCN-LABEL: s_load_constant_v3i32_align16: ; GCN: ; %bb.0: @@ -1684,3 +1706,5 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16 ret <3 x i32> %load } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll index 43c8f46f98cfc..62b8b55072089 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll @@ -132,6 +132,7 @@ define i64 @test_abs_i64(i64 %a) { define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) { ; CHECK-LABEL: test_umin_i64_s: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-NEXT: v_min_u64 v[0:1], s[0:1], s[2:3] ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -144,6 +145,7 @@ define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) { ; CHECK-LABEL: test_umax_i64_s: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-NEXT: v_max_u64 v[0:1], s[0:1], s[2:3] ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -156,6 +158,7 @@ define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) { ; CHECK-LABEL: test_smin_i64_s: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-NEXT: v_min_i64 v[0:1], s[0:1], s[2:3] ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -168,6 +171,7 @@ define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) { ; CHECK-LABEL: test_smax_i64_s: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-NEXT: v_max_i64 v[0:1], s[0:1], s[2:3] ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -180,6 +184,7 @@ define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) { ; CHECK-LABEL: test_abs_i64_s: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-NEXT: s_ashr_i32 s2, s1, 31 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_mov_b32 s3, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 3eecaccf0308f..34e4931674cce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -26,6 +26,7 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; ; GFX1250-LABEL: s_mul_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -125,6 +126,7 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; ; GFX1250-LABEL: s_mul_i16_zeroext: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 @@ -220,6 +222,7 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre ; ; GFX1250-LABEL: s_mul_i16_signext: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_sext_i32_i16 s0, s0 @@ -315,6 +318,7 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; ; GFX1250-LABEL: s_mul_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog %result = mul i32 %num, %den @@ -375,6 +379,7 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d ; ; GFX1250-LABEL: s_mul_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s2 ; GFX1250-NEXT: s_mul_i32 s1, s1, s3 ; GFX1250-NEXT: ; return to shader part epilog @@ -474,6 +479,7 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { ; ; GFX1250-LABEL: s_mul_i33: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-NEXT: ; return to shader part epilog %result = mul i33 %num, %den @@ -535,6 +541,7 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; ; GFX1250-LABEL: s_mul_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-NEXT: ; return to shader part epilog %result = mul i64 %num, %den @@ -712,6 +719,7 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; ; GFX1250-LABEL: s_mul_i96: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s6, s0, s5 ; GFX1250-NEXT: s_mul_i32 s7, s1, s4 ; GFX1250-NEXT: s_mul_i32 s2, s2, s3 @@ -1027,6 +1035,7 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; ; GFX1250-LABEL: s_mul_i128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s9, s0, s6 ; GFX1250-NEXT: s_mul_i32 s11, s1, s5 ; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6 @@ -2218,6 +2227,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; ; GFX1250-LABEL: s_mul_i256: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s17, s0, s10 ; GFX1250-NEXT: s_mul_i32 s19, s1, s9 ; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10 @@ -2997,6 +3007,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 @@ -3101,6 +3112,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX1250-LABEL: s_mul_u64_zext_with_sregs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3191,6 +3203,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 @@ -3310,6 +3323,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; ; GFX1250-LABEL: s_mul_u64_sext_with_sregs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll index c55137574a9a4..a1513663fb331 100644 --- a/llvm/test/CodeGen/AMDGPU/add-max.ll +++ b/llvm/test/CodeGen/AMDGPU/add-max.ll @@ -5,6 +5,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -16,6 +17,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_svv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -27,6 +29,7 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; GCN-LABEL: add_max_u32_ssv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -38,6 +41,7 @@ define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; SDAG-LABEL: add_max_u32_sss: ; SDAG: ; %bb.0: +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_max_u32_e32 v0, s2, v0 @@ -45,6 +49,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c ; ; GISEL-LABEL: add_max_u32_sss: ; GISEL: ; %bb.0: +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0 @@ -58,6 +63,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { ; GCN-LABEL: add_max_u32_vsi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -69,6 +75,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { ; GCN-LABEL: add_max_u32_svl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -80,6 +87,7 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_u32_slv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -91,6 +99,7 @@ define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_max_i32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) @@ -102,6 +111,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_u32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) @@ -113,6 +123,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: add_min_i32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) @@ -124,6 +135,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { ; GCN-LABEL: add_max_v2u16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -135,6 +147,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) { ; GCN-LABEL: add_max_v2u16_svv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -146,6 +159,7 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) { ; GCN-LABEL: add_max_v2u16_ssv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -157,6 +171,7 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) { ; SDAG-LABEL: add_max_v2u16_sss: ; SDAG: ; %bb.0: +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_pk_max_u16 v0, v0, s2 @@ -164,6 +179,7 @@ define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b ; ; GISEL-LABEL: add_max_v2u16_sss: ; GISEL: ; %bb.0: +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 @@ -177,6 +193,7 @@ define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) { ; GCN-LABEL: add_max_v2u16_vsi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -188,6 +205,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) { define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) { ; GCN-LABEL: add_max_v2u16_svl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -199,6 +217,7 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) { define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) { ; GCN-LABEL: add_max_v2u16_slv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> ) @@ -210,6 +229,7 @@ define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) { define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { ; GCN-LABEL: add_max_v2s16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -221,6 +241,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { ; GCN-LABEL: add_min_v2u16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) @@ -232,6 +253,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { ; GCN-LABEL: add_min_v2s16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b) diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll index 22acedc4d6e25..bd43737ca5def 100644 --- a/llvm/test/CodeGen/AMDGPU/add_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll @@ -12,6 +12,7 @@ define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) { ; ; GFX1250-LABEL: test_add_u64_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, %b @@ -29,6 +30,7 @@ define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) { ; ; GFX1250-LABEL: test_add_u64_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, %b @@ -46,6 +48,7 @@ define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) { ; ; GFX1250-LABEL: test_add_u64_sv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, %b @@ -54,12 +57,20 @@ define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) { } define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) { -; GCN-LABEL: test_add_u64_ss: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX12-LABEL: test_add_u64_ss: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_ss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, %b %ret = bitcast i64 %add to <2 x float> ret <2 x float> %ret @@ -75,6 +86,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) { ; ; GFX1250-LABEL: test_add_u64_v_inline_lit: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 5, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, 5 @@ -92,6 +104,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) { ; ; GFX1250-LABEL: test_add_u64_v_small_imm: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, 500 @@ -109,6 +122,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) { ; ; GFX1250-LABEL: test_add_u64_v_64bit_imm: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, 5294967295 @@ -117,13 +131,23 @@ define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) { } define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) { -; GCN-LABEL: test_add_u64_s_small_imm: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX12-LABEL: test_add_u64_s_small_imm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_add_u64_s_small_imm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x1f4 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, 500 %ret = bitcast i64 %add to <2 x float> ret <2 x float> %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index b486fabb19497..e5992e398ddbd 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -9,6 +9,7 @@ target triple = "amdgcn-amd-amdhsa" define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -26,6 +27,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 @@ -53,6 +55,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspace(5) %ptr) { ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast_nonnull: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) @@ -66,6 +69,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo @@ -87,6 +91,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { ; GFX1250-LABEL: use_flat_to_private_addrspacecast: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -104,6 +109,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-SDAG-LABEL: use_flat_to_private_addrspacecast_nonnull: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -114,6 +120,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; ; GFX1250-GISEL-LABEL: use_flat_to_private_addrspacecast_nonnull: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 9bc1986719df8..b78678bbfc9f6 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -13,6 +13,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { ; ; GFX1250-LABEL: v_test_cvt_bf16_f32_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1250-NEXT: ; return to shader part epilog %cvt = fpext bfloat %v to float @@ -28,6 +29,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { ; ; GFX1250-LABEL: v_test_cvt_bf16_f32_s: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshl_b32 s0, s0, 16 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -63,6 +65,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { ; ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: ; return to shader part epilog %res = fptrunc <2 x float> %src to <2 x bfloat> @@ -101,6 +104,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { ; ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog %res = fptrunc <2 x float> %src to <2 x bfloat> @@ -129,6 +133,7 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { ; ; GFX1250-LABEL: v_test_cvt_f32_bf16_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -205,6 +210,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; ; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3] ; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -264,6 +270,7 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) { ; ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: ; return to shader part epilog entry: @@ -304,6 +311,7 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) { ; ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1| ; GFX1250-NEXT: ; return to shader part epilog entry: @@ -342,6 +350,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f32_to_bf16: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX1250-NEXT: flat_store_b16 v[2:3], v0 @@ -378,6 +387,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f32_to_bf16_abs: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0 ; GFX1250-NEXT: flat_store_b16 v[2:3], v0 @@ -415,6 +425,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f32_to_bf16_neg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0 ; GFX1250-NEXT: flat_store_b16 v[2:3], v0 @@ -467,6 +478,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f64_to_bf16: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 @@ -529,6 +541,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f64_to_bf16_neg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 @@ -593,6 +606,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; ; GFX1250-LABEL: fptrunc_f64_to_bf16_abs: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 39618b05e6c71..94213096b9d0c 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_sqrt_bf16_v: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_sqrt_bf16_e32 v2, v2 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -16,6 +17,7 @@ define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) { define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { ; GCN-LABEL: llvm_sqrt_bf16_s: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_sqrt_bf16_e32 v2, s0 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -27,6 +29,7 @@ define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_test_add_v2bf16_vv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, v3 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -38,6 +41,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_add_v2bf16_vs: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, s0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -49,6 +53,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_add_v2bf16_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -60,6 +65,7 @@ define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_add_v2bf16_vc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, 2.0 op_sel_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -71,6 +77,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_add_v2bf16_vl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -82,6 +89,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_test_sub_v2bf16_vv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, v3 neg_lo:[0,1] neg_hi:[0,1] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -93,6 +101,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_sub_v2bf16_vs: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, s0 neg_lo:[0,1] neg_hi:[0,1] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -104,6 +113,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_sub_v2bf16_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, s0, s1 neg_lo:[0,1] neg_hi:[0,1] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -115,6 +125,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_sub_v2bf16_vc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, -2.0 op_sel_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -126,6 +137,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_sub_v2bf16_vl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, 0xc2c8bf80, v2 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -137,6 +149,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_sub_v2bf16_lv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, 0x42c83f80, v2 neg_lo:[0,1] neg_hi:[0,1] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -148,6 +161,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_sub_v2bf16_iv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_add_bf16 v2, v2, 1.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -159,6 +173,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_test_mul_v2bf16_vv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -170,6 +185,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_mul_v2bf16_vs: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -181,6 +197,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_mul_v2bf16_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -194,6 +211,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_mul_v2bf16_vc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v2, v2, 0.5 op_sel_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -205,6 +223,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_mul_v2bf16_vl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -216,6 +235,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_test_min_v2bf16_vv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_min_num_bf16 v2, v2, v3 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -227,6 +247,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_min_v2bf16_vs: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_min_num_bf16 v2, v2, s0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -238,6 +259,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_min_v2bf16_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_min_num_bf16 v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -249,6 +271,7 @@ define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_min_v2bf16_vc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_min_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -260,6 +283,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_min_v2bf16_vl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_min_num_bf16 v2, 0x42c83f80, v2 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -271,6 +295,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-LABEL: v_test_max_v2bf16_vv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v2, v2, v3 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -282,6 +307,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_max_v2bf16_vs: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v2, v2, s0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -293,6 +319,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_max_v2bf16_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -304,6 +331,7 @@ define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_max_v2bf16_vc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -315,6 +343,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_max_v2bf16_vl: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v2, 0x42c83f80, v2 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -326,6 +355,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) { ; GCN-LABEL: test_clamp_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0) @@ -336,6 +366,7 @@ define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) { define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) { ; GCN-LABEL: test_clamp_bf16_s: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp ; GCN-NEXT: ; return to shader part epilog %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0) @@ -346,6 +377,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) { define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) { ; GCN-LABEL: test_clamp_v2bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> ) @@ -357,6 +389,7 @@ define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) { define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { ; GCN-LABEL: test_clamp_v2bf16_s: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp ; GCN-NEXT: ; return to shader part epilog %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> ) @@ -368,6 +401,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_exp_bf16_e32 v0, v0 ; GCN-NEXT: v_nop ; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) @@ -382,6 +416,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) { ; GCN-LABEL: test_clamp_v2bf16_folding: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp ; GCN-NEXT: ; return to shader part epilog %mul = fmul <2 x bfloat> %src0, %src1 @@ -394,6 +429,7 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; GCN-LABEL: v_test_mul_add_v2bf16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -406,6 +442,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { ; GCN-LABEL: v_test_mul_add_v2bf16_vss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -418,6 +455,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { ; GCN-LABEL: v_test_mul_add_v2bf16_sss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2 @@ -432,6 +470,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_mul_add_v2bf16_vsc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -444,6 +483,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_mul_add_v2bf16_vll: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_mov_b32 s0, 0x43484000 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0 @@ -458,6 +498,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfl define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; GCN-LABEL: v_test_fma_v2bf16_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -469,6 +510,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { ; GCN-LABEL: v_test_fma_v2bf16_vss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1 ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -480,6 +522,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) { ; GCN-LABEL: v_test_fma_v2bf16_sss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2 @@ -493,6 +536,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) { ; GCN-LABEL: v_test_fma_v2bf16_vsc: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0] ; GCN-NEXT: global_store_b32 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -504,6 +548,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) { ; GCN-LABEL: v_test_fma_v2bf16_vll: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_mov_b32 s0, 0x42c83f80 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0x43484000 @@ -517,6 +562,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_log2_bf16_v: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_log_bf16_e32 v2, v2 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -528,6 +574,7 @@ define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { ; GCN-LABEL: llvm_log2_bf16_s: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_log_bf16_e32 v2, s0 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -539,6 +586,7 @@ define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_exp2_bf16_v: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_exp_bf16_e32 v2, v2 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm @@ -550,6 +598,7 @@ define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) { define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { ; GCN-LABEL: llvm_exp2_bf16_s: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_exp_bf16_e32 v2, s0 ; GCN-NEXT: global_store_b16 v[0:1], v2, off ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 28d7e6916e519..ab9cd8e037734 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -21317,6 +21317,7 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; ; GFX1250-LABEL: s_fabs_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s0, 0x7fff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 @@ -21433,6 +21434,7 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; ; GFX1250-LABEL: s_fneg_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_xor_b32 s0, s0, 0x8000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 @@ -21554,6 +21556,7 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; ; GFX1250-LABEL: s_fneg_fabs_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_bitset1_b32 s0, 15 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 @@ -44236,6 +44239,7 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX1250TRUE16-LABEL: s_select_bf16: ; GFX1250TRUE16: ; %bb.0: +; GFX1250TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX1250TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 @@ -44246,6 +44250,7 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX1250FAKE16-LABEL: s_select_bf16: ; GFX1250FAKE16: ; %bb.0: +; GFX1250FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250FAKE16-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -44392,6 +44397,7 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX1250TRUE16-LABEL: s_select_v2bf16: ; GFX1250TRUE16: ; %bb.0: +; GFX1250TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250TRUE16-NEXT: s_lshr_b32 s2, s0, 16 ; GFX1250TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 @@ -44406,6 +44412,7 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX1250FAKE16-LABEL: s_select_v2bf16: ; GFX1250FAKE16: ; %bb.0: +; GFX1250FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250FAKE16-NEXT: s_lshr_b32 s2, s0, 16 ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250FAKE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 @@ -44558,6 +44565,7 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX1250TRUE16-LABEL: s_vselect_v2bf16: ; GFX1250TRUE16: ; %bb.0: +; GFX1250TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250TRUE16-NEXT: s_lshr_b32 s3, s0, 16 ; GFX1250TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 @@ -44573,6 +44581,7 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX1250FAKE16-LABEL: s_vselect_v2bf16: ; GFX1250FAKE16: ; %bb.0: +; GFX1250FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250FAKE16-NEXT: s_lshr_b32 s2, s0, 16 ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1250FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 @@ -46190,6 +46199,7 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; ; GFX1250-LABEL: s_select_v3bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -46330,6 +46340,7 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX1250-LABEL: s_select_v4bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -46576,6 +46587,7 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX1250TRUE16-LABEL: s_vselect_v4bf16: ; GFX1250TRUE16: ; %bb.0: +; GFX1250TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250TRUE16-NEXT: s_lshr_b32 s7, s1, 16 ; GFX1250TRUE16-NEXT: s_lshr_b32 s9, s0, 16 ; GFX1250TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -46599,6 +46611,7 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX1250FAKE16-LABEL: s_vselect_v4bf16: ; GFX1250FAKE16: ; %bb.0: +; GFX1250FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250FAKE16-NEXT: s_lshr_b32 s4, s1, 16 ; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1250FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll index 478460595b5b7..c85702021adc9 100644 --- a/llvm/test/CodeGen/AMDGPU/bitop3.ll +++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll @@ -9,10 +9,16 @@ ; ========= Single bit functions ========= define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: not_and_not_and_not_and: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: not_and_not_and_not_and: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: not_and_not_and_not_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1 +; GFX1250-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %notb = xor i32 %b, -1 %notc = xor i32 %c, -1 @@ -23,10 +29,16 @@ define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: not_and_not_and_and: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: not_and_not_and_and: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: not_and_not_and_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2 +; GFX1250-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %notb = xor i32 %b, -1 %and1 = and i32 %nota, %c @@ -36,10 +48,16 @@ define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: not_and_and_not_and: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: not_and_and_not_and: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: not_and_and_not_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4 +; GFX1250-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %notc = xor i32 %c, -1 %and1 = and i32 %nota, %notc @@ -62,11 +80,13 @@ define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: not_and_and_and: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:8 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: not_and_and_and: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 @@ -79,10 +99,16 @@ define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: and_not_and_not_and: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: and_not_and_not_and: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: and_not_and_not_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10 +; GFX1250-NEXT: ; return to shader part epilog %notb = xor i32 %b, -1 %notc = xor i32 %c, -1 %and1 = and i32 %a, %notc @@ -105,11 +131,13 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: and_not_and_and: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: and_not_and_and: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v1, 0, v0 @@ -135,11 +163,13 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: and_and_not_and: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: and_and_not_and: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0x30 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 @@ -152,10 +182,16 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: and_and_and: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: and_and_and: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: and_and_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80 +; GFX1250-NEXT: ; return to shader part epilog %and1 = and i32 %a, %c %and2 = and i32 %and1, %b %ret_cast = bitcast i32 %and2 to float @@ -165,10 +201,16 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) { ; ========= Multi bit functions ========= define amdgpu_ps float @test_12(i32 %a, i32 %b) { -; GCN-LABEL: test_12: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: test_12: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_12: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc +; GFX1250-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %and1 = and i32 %nota, %b %ret_cast = bitcast i32 %and1 to float @@ -189,11 +231,13 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) { ; ; GFX1250-SDAG-LABEL: test_63: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_63: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_not_b32_e32 v1, v1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_bfi_b32 v0, v0, v1, -1 @@ -206,10 +250,16 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) { } define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: test_59: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: test_59: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_59: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b +; GFX1250-NEXT: ; return to shader part epilog %nota = xor i32 %a, -1 %notb = xor i32 %b, -1 %and1 = and i32 %nota, %c @@ -233,11 +283,13 @@ define amdgpu_ps float @test_126(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: test_126: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x7e ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_126: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v0, v1 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -271,11 +323,13 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: test_12_src_overflow: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_12_src_overflow: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_not_b32_e32 v3, v0 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -317,6 +371,7 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-SDAG-LABEL: test_100_src_overflow: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v3, v1, v2, v0 bitop3:0x10 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v4, v0, v2, v1 bitop3:0x40 ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v1, v2, v0 bitop3:0x20 @@ -326,6 +381,7 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-GISEL-LABEL: test_100_src_overflow: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX1250-GISEL-NEXT: v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v1, v0 @@ -362,6 +418,7 @@ define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-LABEL: test_xor3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_xor3_b32 v0, v0, v1, v2 ; GFX1250-NEXT: ; return to shader part epilog %xor1 = xor i32 %a, %b @@ -371,10 +428,16 @@ define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: test_or3: -; GCN: ; %bb.0: -; GCN-NEXT: v_or3_b32 v0, v0, v1, v2 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: test_or3: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_or3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX1250-NEXT: ; return to shader part epilog %or1 = or i32 %a, %b %or2 = or i32 %or1, %c %ret_cast = bitcast i32 %or2 to float @@ -382,10 +445,16 @@ define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps float @test_and_or(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: test_and_or: -; GCN: ; %bb.0: -; GCN-NEXT: v_and_or_b32 v0, v0, v1, v2 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: test_and_or: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_and_or: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX1250-NEXT: ; return to shader part epilog %and1 = and i32 %a, %b %or1 = or i32 %and1, %c %ret_cast = bitcast i32 %or1 to float @@ -404,6 +473,7 @@ define amdgpu_ps float @uniform_3_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; ; GFX1250-LABEL: uniform_3_op: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_not1_b32 s0, s2, s0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, s0, s1 @@ -433,6 +503,7 @@ define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; ; GFX1250-SDAG-LABEL: uniform_4_op: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, s0, s1, v0 bitop3:2 @@ -440,6 +511,7 @@ define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; ; GFX1250-GISEL-LABEL: uniform_4_op: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_and_not1_b32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_and_not1_b32 s0, s0, s1 @@ -463,21 +535,25 @@ define amdgpu_ps half @not_and_not_and_not_and_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: not_and_not_and_not_and_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:1 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: not_and_not_and_not_and_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:1 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-FAKE16-LABEL: not_and_not_and_not_and_b16: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:1 ; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-TRUE16-LABEL: not_and_not_and_not_and_b16: ; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:1 ; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog %nota = xor i16 %a, -1 @@ -497,21 +573,25 @@ define amdgpu_ps half @not_and_not_and_and_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: not_and_not_and_and_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:2 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: not_and_not_and_and_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:2 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-FAKE16-LABEL: not_and_not_and_and_b16: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:2 ; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-TRUE16-LABEL: not_and_not_and_and_b16: ; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:2 ; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog %nota = xor i16 %a, -1 @@ -530,21 +610,25 @@ define amdgpu_ps half @not_and_and_not_and_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: not_and_and_not_and_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:4 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: not_and_and_not_and_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:4 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-FAKE16-LABEL: not_and_and_not_and_b16: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:4 ; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-TRUE16-LABEL: not_and_and_not_and_b16: ; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:4 ; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog %nota = xor i16 %a, -1 @@ -569,16 +653,19 @@ define amdgpu_ps half @test_xor3_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_xor3_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0x96 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: test_xor3_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0x96 ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_xor3_b16: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_xor3_b32 v0, v0, v1, v2 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %xor1 = xor i16 %a, %b @@ -600,16 +687,19 @@ define amdgpu_ps half @test_or3_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_or3_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xfe ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: test_or3_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0xfe ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_or3_b16: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %or1 = or i16 %a, %b @@ -631,16 +721,19 @@ define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_and_or_b16: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xec ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-TRUE16-LABEL: test_and_or_b16: ; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0xec ; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: test_and_or_b16: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, v0, v1, v2 ; GFX1250-GISEL-NEXT: ; return to shader part epilog %and1 = and i16 %a, %b @@ -649,5 +742,6 @@ define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) { ret half %ret_cast } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} ; GFX1250-FAKE16: {{.*}} ; GFX1250-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 31307b245bafe..f784e0df723e9 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -22,6 +22,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_max_short_forward_branch: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s0, 0 @@ -62,6 +63,7 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s0, 0 @@ -104,6 +106,7 @@ bb3: define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: ; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_f32 s0, 0 @@ -146,6 +149,7 @@ bb3: define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: min_long_forward_vbranch: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -198,6 +202,7 @@ bb3: define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-LABEL: long_backward_sbranch: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: .LBB4_1: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -238,6 +243,7 @@ bb3: define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_forward_branch: ; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s0, 0 @@ -299,6 +305,7 @@ bb4: define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) { ; GCN-LABEL: uniform_unconditional_min_long_backward_branch: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo ; GCN-NEXT: .LBB6_1: ; %loop ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -334,6 +341,7 @@ loop: define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-LABEL: expand_requires_expand: ; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_lt_i32 s0, 0 @@ -401,6 +409,7 @@ bb3: define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 { ; GCN-LABEL: uniform_inside_divergent: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: s_mov_b32 s3, exec_lo ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -450,6 +459,7 @@ endif: define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-LABEL: analyze_mask_branch: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_mov_b32 s0, exec_lo ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_mov_b32_e64 v0, 0 @@ -518,6 +528,7 @@ ret: define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 { ; GCN-LABEL: long_branch_hang: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GCN-NEXT: s_mov_b32 s7, -1 ; GCN-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 7dbbeaabeb715..ef5438e63f667 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -39,6 +39,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: kernel: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -83,11 +84,15 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { ; ; GFX1250-TRUE16-LABEL: ps_ret_cc_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: ps_ret_cc_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -119,6 +124,8 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { ; ; GFX1250-LABEL: ps_ret_cc_inreg_f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: s_add_f16 s0, s0, 1.0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -277,6 +284,8 @@ define amdgpu_kernel void @call_coldcc() #0 { ; ; GFX1250-LABEL: call_coldcc: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s32, 0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1250-NEXT: s_get_pc_i64 s[6:7] ; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], coldcc@gotpcrel+4 @@ -286,7 +295,6 @@ define amdgpu_kernel void @call_coldcc() #0 { ; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1250-NEXT: s_mov_b32 s32, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13] ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off @@ -388,6 +396,8 @@ define amdgpu_kernel void @call_fastcc() #0 { ; ; GFX1250-LABEL: call_fastcc: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s32, 0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1250-NEXT: s_get_pc_i64 s[6:7] ; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], fastcc@gotpcrel+4 @@ -397,7 +407,6 @@ define amdgpu_kernel void @call_fastcc() #0 { ; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1250-NEXT: s_mov_b32 s32, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13] ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off @@ -433,11 +442,15 @@ define amdgpu_cs half @cs_mesa(half %arg0) { ; ; GFX1250-TRUE16-LABEL: cs_mesa: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: cs_mesa: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -470,11 +483,15 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) { ; ; GFX1250-TRUE16-LABEL: ps_mesa_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: ps_mesa_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -507,11 +524,15 @@ define amdgpu_vs half @vs_mesa(half %arg0) { ; ; GFX1250-TRUE16-LABEL: vs_mesa: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: vs_mesa: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -544,11 +565,15 @@ define amdgpu_gs half @gs_mesa(half %arg0) { ; ; GFX1250-TRUE16-LABEL: gs_mesa: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: gs_mesa: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -581,11 +606,15 @@ define amdgpu_hs half @hs_mesa(half %arg0) { ; ; GFX1250-TRUE16-LABEL: hs_mesa: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: hs_mesa: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 @@ -620,6 +649,8 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v2f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, @@ -654,6 +685,8 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v2f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, @@ -690,6 +723,7 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -731,6 +765,7 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -777,6 +812,8 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v4f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: ; return to shader part epilog @@ -824,6 +861,8 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v4f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] ; GFX1250-NEXT: ; return to shader part epilog @@ -871,6 +910,7 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 ; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 ; GFX1250-NEXT: s_add_co_i32 s1, s1, 2 @@ -914,6 +954,7 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v3f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_f32 s0, s0, 1.0 ; GFX1250-NEXT: s_add_f32 s1, s1, 2.0 ; GFX1250-NEXT: s_add_f32 s2, s2, 4.0 @@ -980,6 +1021,7 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v5i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s3, s3, 4 ; GFX1250-NEXT: s_add_co_i32 s2, s2, 3 ; GFX1250-NEXT: s_add_co_i32 s1, s1, 2 @@ -1037,6 +1079,7 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_v5f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_f32 s3, s3, -1.0 ; GFX1250-NEXT: s_add_f32 s4, s4, 0.5 ; GFX1250-NEXT: s_add_f32 s0, s0, 1.0 @@ -1085,6 +1128,7 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_add_nc_u32 v2, 3, v2 :: v_dual_add_nc_u32 v1, 2, v1 ; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off @@ -1123,6 +1167,7 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v3f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 ; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off @@ -1171,6 +1216,7 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v5i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2 ; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4 ; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0 @@ -1220,6 +1266,7 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { ; ; GFX1250-LABEL: ps_mesa_v5f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 ; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 ; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 @@ -1261,12 +1308,14 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) { ; ; GFX1250-TRUE16-LABEL: ps_mesa_i16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.l ; GFX1250-TRUE16-NEXT: global_store_b16 v[0:1], v0, off ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: ps_mesa_i16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, v0, v0 ; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v0, off ; GFX1250-FAKE16-NEXT: s_endpgm @@ -1302,6 +1351,7 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; GFX1250-LABEL: ps_mesa_inreg_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -1313,10 +1363,26 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { } define amdgpu_ps i16 @ret_ps_mesa_i16() { -; GCN-LABEL: ret_ps_mesa_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_movk_i32 s0, 0x7b -; GCN-NEXT: ; return to shader part epilog +; SI-LABEL: ret_ps_mesa_i16: +; SI: ; %bb.0: +; SI-NEXT: s_movk_i32 s0, 0x7b +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ret_ps_mesa_i16: +; VI: ; %bb.0: +; VI-NEXT: s_movk_i32 s0, 0x7b +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ret_ps_mesa_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: ret_ps_mesa_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_movk_i32 s0, 0x7b +; GFX1250-NEXT: ; return to shader part epilog ret i16 123 } @@ -1353,6 +1419,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { ; ; GFX1250-LABEL: amd_kernel_i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 @@ -1418,6 +1485,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v2i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1519,6 +1587,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v4i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1617,6 +1686,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v3i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 2 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0 @@ -1737,6 +1807,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v5i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0 @@ -1902,6 +1973,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v8i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2179,6 +2251,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v16i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[4:5], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2700,6 +2773,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v32i8: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16 ; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0 @@ -2869,6 +2943,7 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1250-NEXT: global_store_b8 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -2993,6 +3068,7 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) { ; ; GFX1250-TRUE16-LABEL: amdgpu_cs_v8i1: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l ; GFX1250-TRUE16-NEXT: v_and_b16 v3.l, v6.l, 1 ; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1 @@ -3018,6 +3094,7 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) { ; ; GFX1250-FAKE16-LABEL: amdgpu_cs_v8i1: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7 @@ -3258,6 +3335,7 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) { ; ; GFX1250-TRUE16-LABEL: amdgpu_cs_v16i1: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_and_b16 v2.h, v6.l, 1 ; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l ; GFX1250-TRUE16-NEXT: v_and_b16 v4.h, v10.l, 1 @@ -3302,6 +3380,7 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) { ; ; GFX1250-FAKE16-LABEL: amdgpu_cs_v16i1: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 @@ -3742,6 +3821,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; ; GFX1250-TRUE16-LABEL: amdgpu_cs_v32i1: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l ; GFX1250-TRUE16-NEXT: v_and_b16 v19.l, v22.l, 1 ; GFX1250-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1 @@ -3819,6 +3899,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; ; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3 @@ -3930,6 +4011,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_inreg_i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s0, 1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -4026,6 +4108,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_inreg_v8i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s6, s6, 1 ; GFX1250-NEXT: s_lshl_b32 s5, s5, 1 ; GFX1250-NEXT: s_and_b32 s4, s4, 1 @@ -4214,6 +4297,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_inreg_v16i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s10, s10, 1 ; GFX1250-NEXT: s_lshl_b32 s9, s9, 1 ; GFX1250-NEXT: s_and_b32 s8, s8, 1 @@ -4570,6 +4654,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_inreg_v32i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s10, s10, 1 ; GFX1250-NEXT: s_lshl_b32 s9, s9, 1 ; GFX1250-NEXT: s_and_b32 s8, s8, 1 @@ -4694,6 +4779,7 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_i1_sext: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1250-NEXT: global_store_b8 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm @@ -4721,6 +4807,7 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) { ; ; GFX1250-LABEL: amdgpu_cs_i1_zext: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_store_b8 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm store i1 %arg0, ptr addrspace(1) poison @@ -4728,3 +4815,5 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) { } attributes #0 = { nounwind noinline } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 8d05317162e9c..a8ef781b62a42 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -117,6 +117,7 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: sadd64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -228,6 +229,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1250-LABEL: sadd64ri: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -329,6 +331,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1250-LABEL: vadd64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -430,6 +433,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: vadd64ri: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -536,6 +540,7 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: suaddo32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 @@ -668,6 +673,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX1250-LABEL: uaddo32_vcc_user: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -813,6 +819,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: suaddo64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_u32 s0, s12, s14 @@ -955,6 +962,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: vuaddo64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1086,6 +1094,7 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: ssub64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -1197,6 +1206,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; ; GFX1250-LABEL: ssub64ri: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1298,6 +1308,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; ; GFX1250-LABEL: vsub64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -1399,6 +1410,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: vsub64ri: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -1506,6 +1518,7 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: susubo32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 @@ -1638,6 +1651,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; ; GFX1250-LABEL: usubo32_vcc_user: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1783,6 +1797,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: susubo64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14 @@ -1925,6 +1940,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: vusubo64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -3041,6 +3057,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX1250-LABEL: sudiv64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll index 79b44d6a92caa..7ccf26f7eaad3 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll @@ -581,6 +581,7 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) { ; ; GFX1250-LABEL: s_fmaak_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; encoding: [0x41,0x06,0x80,0xb9,0x01,0x00,0x00,0x00] ; GFX1250-NEXT: s_fmaak_f32 s0, s0, s1, 0x43800000 ; encoding: [0x00,0x01,0x80,0xa2,0x00,0x00,0x80,0x43] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; encoding: [0x0b,0x00,0x87,0xbf] ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] @@ -594,7 +595,7 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) { ; GFX1100: codeLenInByte = 16 ; GFX1150: codeLenInByte = 16 ; GFX1200: codeLenInByte = 16 -; GFX1250: codeLenInByte = 16 +; GFX1250: codeLenInByte = 24 define double @v_mul_f64_vop2_literal_32(double %x) { ; GFX9-LABEL: v_mul_f64_vop2_literal_32: diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll index 23d2b18f5311b..8116078c9ee71 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll @@ -10,6 +10,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -35,6 +36,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_max_offset: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -60,6 +62,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_too_far: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -86,6 +89,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_x2: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -127,6 +131,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_x2_barrier: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -174,6 +179,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_x2_nonzero_base: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) @@ -219,6 +225,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; GFX1250-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -251,6 +258,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 { ; GFX1250-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -283,6 +291,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: read2_ptr_is_subreg_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -314,6 +323,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_volatile_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -340,6 +350,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f32_volatile_1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -367,6 +378,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; GFX1250-LABEL: unaligned_read2_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -393,6 +405,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; GFX1250-LABEL: unaligned_offset_read2_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -419,6 +432,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; GFX1250-LABEL: misaligned_2_simple_read2_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -445,12 +459,14 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_b32_e32 v4, 0x1ff8, v0 ; GFX1250-NEXT: ds_load_2addr_b64 v[0:3], v4 offset1:8 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -470,12 +486,14 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f64_max_offset: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_b32_e32 v4, 0x1ff8, v0 ; GFX1250-NEXT: ds_load_2addr_b64 v[0:3], v4 offset1:255 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -495,6 +513,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX1250-LABEL: simple_read2_f64_too_far: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -502,6 +521,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { ; GFX1250-NEXT: ds_load_b64 v[0:1], v4 ; GFX1250-NEXT: ds_load_b64 v[2:3], v4 offset:2056 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -522,6 +542,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 { define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { ; GFX1250-LABEL: misaligned_read2_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -531,6 +552,7 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 ; GFX1250-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:14 offset1:15 ; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm @@ -551,6 +573,7 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) { ; GFX1250-LABEL: load_constant_adjacent_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: ds_load_b64 v[0:1], v2 @@ -569,6 +592,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) { ; GFX1250-LABEL: load_constant_disjoint_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:2 @@ -589,6 +613,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) { ; GFX1250-LABEL: load_misaligned64_constant_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: ds_load_b128 v[0:3], v4 @@ -609,6 +634,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) % define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) { ; GFX1250-LABEL: load_misaligned64_constant_large_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: ds_load_b64 v[0:1], v4 offset:16384 @@ -631,6 +657,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 { ; GFX1250-LABEL: sgemm_inner_loop_read2_sequence: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 @@ -719,6 +746,7 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; GFX1250-LABEL: misaligned_read2_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 @@ -734,6 +762,7 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 { ; GFX1250-LABEL: misaligned_read2_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 @@ -749,6 +778,7 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @ds_read_diff_base_interleaving( ; GFX1250-LABEL: ds_read_diff_base_interleaving: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_dual_lshrrev_b32 v1, 6, v0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -815,6 +845,8 @@ bb: define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) { ; GFX1250-LABEL: ds_read_call_read: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s32, 0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[36:38], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-NEXT: v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v31, v0 @@ -824,7 +856,6 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1250-NEXT: s_mov_b32 s32, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_lshl_add_u32 v40, v1, 2, s38 ; GFX1250-NEXT: ds_load_b32 v41, v40 @@ -850,6 +881,7 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) { ; GFX1250-LABEL: read2_v2i32_align1_odd_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: ds_load_b64 v[0:1], v2 offset:65 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 22a26f373927e..1684437eff580 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX1250-LABEL: simple_write2_one_val_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -85,6 +86,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; ; GFX1250-LABEL: simple_write2_two_val_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -145,6 +147,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; ; GFX1250-LABEL: simple_write2_two_val_f32_volatile_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -206,6 +209,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; ; GFX1250-LABEL: simple_write2_two_val_f32_volatile_1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -271,6 +275,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace ; ; GFX1250-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -325,6 +330,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C ; ; GFX1250-LABEL: simple_write2_two_val_subreg2_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -376,6 +382,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C ; ; GFX1250-LABEL: simple_write2_two_val_subreg4_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -428,6 +435,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; ; GFX1250-LABEL: simple_write2_two_val_max_offset_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -488,6 +496,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; ; GFX1250-LABEL: simple_write2_two_val_too_far_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -548,6 +557,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; ; GFX1250-LABEL: simple_write2_two_val_f32_x2: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -619,6 +629,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; ; GFX1250-LABEL: simple_write2_two_val_f32_x2_nonzero_base: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -698,6 +709,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; ; GFX1250-LABEL: write2_ptr_subreg_arg_two_val_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -761,6 +773,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX1250-LABEL: simple_write2_one_val_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -814,6 +827,7 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; ; GFX1250-LABEL: misaligned_simple_write2_one_val_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -913,6 +927,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; ; GFX1250-LABEL: unaligned_offset_simple_write2_one_val_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -967,6 +982,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; ; GFX1250-LABEL: simple_write2_two_val_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1013,6 +1029,7 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() { ; ; GFX1250-LABEL: store_constant_adjacent_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x7b0000007b ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: ds_store_b64 v2, v[0:1] @@ -1040,6 +1057,7 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() { ; ; GFX1250-LABEL: store_constant_disjoint_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1250-NEXT: ds_store_2addr_b32 v1, v0, v0 offset1:2 ; GFX1250-NEXT: s_endpgm @@ -1072,6 +1090,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() { ; ; GFX1250-LABEL: store_misaligned64_constant_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 @@ -1108,6 +1127,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; ; GFX1250-LABEL: store_misaligned64_constant_large_offsets: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x7b ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:16384 @@ -1167,6 +1187,7 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, ; ; GFX1250-LABEL: write2_sgemm_sequence: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 ; GFX1250-NEXT: s_and_b32 s2, ttmp6, 15 ; GFX1250-NEXT: s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4) @@ -1283,6 +1304,7 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX1250-LABEL: simple_write2_v4f32_superreg_align4: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x8 ; GFX1250-NEXT: s_load_b32 s8, s[4:5], 0x0 @@ -1351,6 +1373,7 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { ; ; GFX1250-LABEL: write2_v2i32_align1_odd_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x1c80000007b ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:65 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll index d747fb7cce7dc..b23645839944e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -18,6 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -31,6 +32,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -51,6 +53,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) # define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { ; GFX1250-LABEL: s_test_canonicalize_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -86,6 +89,7 @@ define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -110,6 +114,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %o define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -134,6 +139,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace( define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -157,6 +163,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %o define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { ; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -180,6 +187,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr ad define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { ; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -204,6 +212,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(p define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -217,6 +226,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -230,6 +240,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -243,6 +254,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -256,6 +268,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -269,6 +282,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -282,6 +296,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -295,6 +310,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr a define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -308,6 +324,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -321,6 +338,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr a define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -334,6 +352,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -347,6 +366,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrs define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -360,6 +380,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrs define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -373,6 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -386,6 +408,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -399,6 +422,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -412,6 +436,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -437,6 +462,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -467,6 +493,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -498,6 +525,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspac define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -528,6 +556,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -548,6 +577,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -561,6 +591,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -574,6 +605,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -587,6 +619,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -600,6 +633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %ou define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -613,6 +647,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -626,6 +661,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16( define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -639,6 +675,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -652,6 +689,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16( define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -665,6 +703,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -678,6 +717,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) % define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -691,6 +731,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr add define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -704,6 +745,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr add define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -717,6 +759,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspa define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -730,6 +773,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspa define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -743,6 +787,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspa define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -793,6 +838,7 @@ define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -928,6 +974,7 @@ define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll index 213233e802a96..29aad9c5f9dc1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll @@ -13,6 +13,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) { ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 0c2d51dc9b0c2..6ae3cfb7e106a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -10,6 +10,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -49,6 +50,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV @@ -89,6 +91,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV @@ -134,6 +137,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -174,6 +178,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -215,6 +220,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -269,6 +275,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -284,6 +291,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -336,6 +344,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -351,6 +360,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -404,6 +414,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -418,6 +429,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -468,6 +480,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -482,6 +495,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -540,6 +554,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -584,6 +599,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -717,6 +733,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -764,6 +781,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -907,6 +925,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -943,6 +962,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -1051,6 +1071,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -1089,6 +1110,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -1212,6 +1234,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1252,6 +1275,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -1299,6 +1323,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -1337,6 +1362,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV @@ -1382,6 +1408,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1426,6 +1453,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1563,6 +1591,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -1610,6 +1639,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1757,6 +1787,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -1796,6 +1827,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -1915,6 +1947,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -1956,6 +1989,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -2090,6 +2124,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2130,6 +2165,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -2177,6 +2213,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -2215,6 +2252,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV @@ -2260,6 +2298,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2304,6 +2343,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2443,6 +2483,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -2490,6 +2531,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2639,6 +2681,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -2678,6 +2721,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -2799,6 +2843,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -2840,6 +2885,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -2976,6 +3022,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -3016,6 +3063,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -3063,6 +3111,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -3101,6 +3150,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV @@ -3146,6 +3196,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3191,6 +3242,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3329,6 +3381,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -3377,6 +3430,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3525,6 +3579,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -3565,6 +3620,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -3685,6 +3741,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -3727,6 +3784,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -3862,6 +3920,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -3902,6 +3961,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -3949,6 +4009,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -3987,6 +4048,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV @@ -4032,6 +4094,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4077,6 +4140,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4215,6 +4279,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -4263,6 +4328,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4411,6 +4477,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -4451,6 +4518,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -4571,6 +4639,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -4613,6 +4682,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -4748,6 +4818,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4788,6 +4859,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4835,6 +4907,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV @@ -4873,6 +4946,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV @@ -4918,6 +4992,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4963,6 +5038,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5101,6 +5177,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -5149,6 +5226,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5297,6 +5375,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -5337,6 +5416,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -5457,6 +5537,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -5499,6 +5580,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -5634,6 +5716,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -5666,6 +5749,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -5705,6 +5789,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5736,6 +5821,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5774,6 +5860,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5814,6 +5901,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5949,6 +6037,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -5992,6 +6081,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6137,6 +6227,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -6173,6 +6264,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -6289,6 +6381,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -6327,6 +6420,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -6458,6 +6552,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -6490,6 +6585,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -6529,6 +6625,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6560,6 +6657,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6598,6 +6696,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6638,6 +6737,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6773,6 +6873,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -6816,6 +6917,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6961,6 +7063,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -6997,6 +7100,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -7113,6 +7217,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -7151,6 +7256,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -7282,6 +7388,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -7314,6 +7421,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -7353,6 +7461,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -7384,6 +7493,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -7422,6 +7532,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7462,6 +7573,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7597,6 +7709,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -7640,6 +7753,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7785,6 +7899,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -7821,6 +7936,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -7937,6 +8053,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -7975,6 +8092,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -8106,6 +8224,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -8138,6 +8257,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -8177,6 +8297,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -8208,6 +8329,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -8246,6 +8368,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8286,6 +8409,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8421,6 +8545,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -8464,6 +8589,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8609,6 +8735,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -8645,6 +8772,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -8761,6 +8889,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -8799,6 +8928,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -8930,6 +9060,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -8973,6 +9104,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -9023,6 +9155,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -9063,6 +9196,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -9110,6 +9244,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 @@ -9156,6 +9291,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 @@ -9304,6 +9440,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -9353,6 +9490,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 @@ -9511,6 +9649,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 @@ -9552,6 +9691,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -9681,6 +9821,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -9724,6 +9865,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -9868,6 +10010,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -9900,6 +10043,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -9939,6 +10083,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -9967,6 +10112,7 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10002,6 +10148,7 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10046,6 +10193,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10191,6 +10339,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -10238,6 +10387,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10393,6 +10543,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -10430,6 +10581,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -10551,6 +10703,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -10590,6 +10743,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -10727,6 +10881,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -10759,6 +10914,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -10798,6 +10954,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10826,6 +10983,7 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10861,6 +11019,7 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10908,6 +11067,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11056,6 +11216,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -11106,6 +11267,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11264,6 +11426,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo @@ -11304,6 +11467,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo @@ -11428,6 +11592,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 @@ -11470,6 +11635,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 90e0b194ae74d..8b7c49b5931af 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -15,6 +15,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -29,6 +30,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -44,6 +46,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -59,6 +62,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -74,6 +78,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -83,6 +88,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff7fffff ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -93,6 +99,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg8388609: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -109,6 +116,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0 @@ -118,6 +126,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -128,6 +137,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0 @@ -144,6 +154,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, 1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] @@ -152,6 +163,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -162,6 +174,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000000: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NOECC-NEXT: s_add_co_i32 s3, s3, 1 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] @@ -177,6 +190,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -186,6 +200,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -196,6 +211,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000001: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -212,6 +228,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -221,6 +238,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -231,6 +249,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000FFF: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -247,6 +266,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -256,6 +276,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -266,6 +287,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100001000: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 @@ -282,6 +304,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800000, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -291,6 +314,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -301,6 +325,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0x800000, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -317,6 +342,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000000: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, -1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] @@ -325,6 +351,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000000: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -335,6 +362,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0x100000000: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NOECC-NEXT: s_add_co_i32 s3, s3, -1 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] @@ -350,6 +378,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbase) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000001: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -359,6 +388,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000001: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -369,6 +399,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0x100000001: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_add_co_u32 v0, s0, 0, s2 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 @@ -390,6 +421,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -405,6 +437,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -421,6 +454,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %s define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -433,6 +467,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 @@ -446,6 +481,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NOECC-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -468,6 +504,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388608: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -484,6 +521,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388607: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388607 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -499,6 +537,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -515,6 +554,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(pt define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -532,6 +572,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -549,6 +590,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr in define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -567,6 +609,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_of define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -591,6 +634,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_of define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -602,6 +646,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; ; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -614,6 +659,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { ; ; GFX1250-NOECC-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0 @@ -635,6 +681,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -646,6 +693,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff ; ; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -658,6 +706,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff ; ; GFX1250-NOECC-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-NEXT: ds_load_b64 v[2:3], v1 ; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0 @@ -680,6 +729,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -687,6 +737,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 @@ -696,6 +747,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_uniform_offset: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -712,6 +764,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -719,6 +772,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 @@ -728,6 +782,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -745,6 +800,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -752,6 +808,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 @@ -761,6 +818,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -779,6 +837,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -786,6 +845,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0 @@ -795,6 +855,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, s4 ; GFX1250-NOECC-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -814,6 +875,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -823,6 +885,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse ; ; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -835,6 +898,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse ; ; GFX1250-NOECC-LABEL: flat_load_i8_vgpr64_sgpr32: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: s_mov_b32 s3, 0 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NOECC-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -853,6 +917,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i32 inreg %soffset) { ; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -862,6 +927,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3 ; ; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -874,6 +940,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3 ; ; GFX1250-NOECC-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: s_mov_b32 s3, 0 ; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NOECC-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -897,6 +964,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3 define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) { ; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset @@ -913,6 +981,7 @@ define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inreg %sbase, ptr %voffset.ptr) { ; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing_immoffset: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:128 @@ -930,6 +999,7 @@ define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inr define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, ptr %voffset.ptr) { ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset @@ -946,6 +1016,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg %sbase, ptr %voffset.ptr) { ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_imm_offset: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset @@ -963,6 +1034,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) { ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset @@ -982,24 +1054,28 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_b16 v0, v0, s[2:3] ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: ; return to shader part epilog @@ -1013,24 +1089,28 @@ define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_immneg128: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_b16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_immneg128: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: ; return to shader part epilog @@ -1045,24 +1125,28 @@ define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voff define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_f16: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_f16: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_f16: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_b16 v0, v0, s[2:3] ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_f16: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: ; return to shader part epilog @@ -1075,24 +1159,28 @@ define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_f16_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_f16_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_f16_immneg128: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_b16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_f16_immneg128: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: ; return to shader part epilog @@ -1106,6 +1194,7 @@ define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1119,6 +1208,7 @@ define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1133,6 +1223,7 @@ define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %vof define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1145,6 +1236,7 @@ define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_f32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1158,6 +1250,7 @@ define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %vof define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1171,6 +1264,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffse define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1185,6 +1279,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1197,6 +1292,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffse define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2f16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1210,6 +1306,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_p3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1224,6 +1321,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_p3_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1239,6 +1337,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1252,6 +1351,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_f64_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1266,6 +1366,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i3 define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1279,6 +1380,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i64_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1293,6 +1395,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i3 define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1305,6 +1408,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2f32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1318,6 +1422,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1331,6 +1436,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1345,6 +1451,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1358,6 +1465,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffs define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4i16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1372,6 +1480,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1385,6 +1494,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffs define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4f16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1399,6 +1509,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_p1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1413,6 +1524,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_p1_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1428,6 +1540,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v3f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1440,6 +1553,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v3f32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1453,6 +1567,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1466,6 +1581,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v3i32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1480,6 +1596,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v6f16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1492,6 +1609,7 @@ define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffse define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v6f16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1505,6 +1623,7 @@ define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1517,6 +1636,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4f32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1530,6 +1650,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1543,6 +1664,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffs define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4i32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1557,6 +1679,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1570,6 +1693,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffs define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2i64_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1584,6 +1708,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1597,6 +1722,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffse define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_i128_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1611,6 +1737,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2p1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1625,6 +1752,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffse define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v2p1_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1640,6 +1768,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4p3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1654,6 +1783,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffse define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_load_saddr_v4p3_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1673,6 +1803,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_sextload_saddr_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1687,6 +1818,7 @@ define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_sextload_saddr_i8_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1702,6 +1834,7 @@ define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 % define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_sextload_saddr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1716,6 +1849,7 @@ define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_sextload_saddr_i16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1731,6 +1865,7 @@ define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_zextload_saddr_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1745,6 +1880,7 @@ define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_zextload_saddr_i8_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1760,6 +1896,7 @@ define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 % define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_zextload_saddr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1774,6 +1911,7 @@ define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: flat_zextload_saddr_i16_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -1793,6 +1931,7 @@ define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: atomic_flat_load_saddr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1808,6 +1947,7 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffse define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: atomic_flat_load_saddr_i32_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1824,6 +1964,7 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: atomic_flat_load_saddr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1839,6 +1980,7 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 % define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-LABEL: atomic_flat_load_saddr_i64_immneg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1859,18 +2001,21 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sb define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_undef_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_undef_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_undef_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v0, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: ; return to shader part epilog @@ -1885,18 +2030,21 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: ; return to shader part epilog @@ -1912,6 +2060,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr in define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1919,6 +2068,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1926,6 +2076,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1942,6 +2093,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1949,6 +2101,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1956,6 +2109,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1973,6 +2127,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -1980,6 +2135,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -1987,6 +2143,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2002,6 +2159,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -2009,6 +2167,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -2016,6 +2175,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_b16 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2032,6 +2192,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -2039,6 +2200,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -2046,6 +2208,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg % ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_u8 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2062,6 +2225,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg % define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -2069,6 +2233,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 @@ -2076,6 +2241,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_u8 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2093,6 +2259,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -2100,6 +2267,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2109,6 +2277,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_i8 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2125,6 +2294,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 @@ -2132,6 +2302,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2141,6 +2312,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_i8 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2162,6 +2334,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2169,6 +2342,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2176,6 +2350,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_b16 v0, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: ; return to shader part epilog @@ -2190,6 +2365,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2197,6 +2373,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2204,6 +2381,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_b16 v0, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: ; return to shader part epilog @@ -2219,6 +2397,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 @@ -2226,6 +2405,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2233,6 +2413,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2241,6 +2422,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2257,6 +2439,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 @@ -2264,6 +2447,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2271,6 +2455,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: +; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2279,6 +2464,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr ; ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX1250-NOECC-SDAG-FAKE16: ; %bb.0: +; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2296,6 +2482,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2303,6 +2490,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2312,6 +2500,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2327,6 +2516,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2334,6 +2524,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2343,6 +2534,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2359,6 +2551,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2366,6 +2559,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2375,6 +2569,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg % ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_u8 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2391,6 +2586,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg % define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2398,6 +2594,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -2407,6 +2604,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2424,6 +2622,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2431,6 +2630,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2441,6 +2641,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_i8 v1, v0, s[2:3] ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2457,6 +2658,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 @@ -2464,6 +2666,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -2474,6 +2677,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: flat_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 ; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, v1 @@ -2496,6 +2700,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_bitop2_b32 v0, 16, v0 bitop3:0x54 ; GFX1250-NEXT: flat_load_u8 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2512,6 +2717,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { ; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_or_b32_e32 v0, 0x1040, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: flat_load_u8 v0, v[0:1] @@ -2533,6 +2739,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr add define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 @@ -2550,6 +2757,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -2569,6 +2777,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; ; GFX1250-NOECC-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-NOECC: ; %bb.0: ; %bb +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-NOECC-NEXT: .LBB116_1: ; %bb3 @@ -2604,6 +2813,7 @@ bb3: ; preds = %bb3, %bb define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inreg %arg.1) { ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 @@ -2624,6 +2834,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] @@ -2646,6 +2857,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; ; GFX1250-NOECC-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-NOECC: ; %bb.0: ; %bb +; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-NOECC-NEXT: .LBB117_1: ; %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll index 3d0e2875e91a2..78ed084493c4b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll @@ -7,6 +7,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] @@ -22,6 +23,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voff define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_2047: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047 @@ -38,6 +40,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbas define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048 @@ -60,6 +63,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -70,6 +74,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d ; ; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -89,6 +94,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -99,6 +105,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff ; ; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -122,6 +129,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) { ; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -133,6 +141,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) { ; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -145,6 +154,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) { ; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -156,6 +166,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) { ; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -168,6 +179,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -179,6 +191,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -191,6 +204,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) { ; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -202,6 +216,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) { ; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -214,6 +229,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { ; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -225,6 +241,7 @@ define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { ; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -237,12 +254,14 @@ define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sb define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -255,12 +274,14 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -274,12 +295,14 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, double %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -292,12 +315,14 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, double %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -311,12 +336,14 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %s define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -329,12 +356,14 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -348,12 +377,14 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -366,12 +397,14 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -385,12 +418,14 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -403,12 +438,14 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -422,12 +459,14 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -440,12 +479,14 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -459,12 +500,14 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_endpgm @@ -477,12 +520,14 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_endpgm @@ -496,6 +541,7 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sb define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] @@ -503,6 +549,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] @@ -516,6 +563,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 @@ -523,6 +571,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 @@ -537,6 +586,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] @@ -544,6 +594,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] @@ -557,6 +608,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 @@ -564,6 +616,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 @@ -578,6 +631,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] @@ -585,6 +639,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] @@ -598,6 +653,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 @@ -605,6 +661,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 @@ -619,6 +676,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] @@ -626,6 +684,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] @@ -639,6 +698,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 @@ -646,6 +706,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 @@ -660,6 +721,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -667,6 +729,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -680,6 +743,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -687,6 +751,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -701,6 +766,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -708,6 +774,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -721,6 +788,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -728,6 +796,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -742,6 +811,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -749,6 +819,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -762,6 +833,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -769,6 +841,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -783,6 +856,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -790,6 +864,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -803,6 +878,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -810,6 +886,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -824,6 +901,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -831,6 +909,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -844,6 +923,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -851,6 +931,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -865,6 +946,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -872,6 +954,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -885,6 +968,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %v define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -892,6 +976,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -906,6 +991,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -913,6 +999,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -926,6 +1013,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -933,6 +1021,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -947,6 +1036,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg % define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] @@ -954,6 +1044,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] @@ -967,6 +1058,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { ; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 @@ -974,6 +1066,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg % ; ; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 @@ -992,6 +1085,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg % define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS @@ -1005,6 +1099,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS @@ -1019,6 +1114,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr i define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1027,6 +1123,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 @@ -1041,6 +1138,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 @@ -1049,6 +1147,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr i ; ; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 @@ -1068,6 +1167,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr i define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1080,6 +1180,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i3 define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1093,6 +1194,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr in define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1106,6 +1208,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %s define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 5e6de6d66ccc1..ab9c7f16d54ac 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -162,6 +162,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_0_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -352,6 +353,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_1_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -609,6 +611,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -639,6 +642,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -897,6 +901,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -927,6 +932,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 2756472652bc9..7c2aeeb90bd9e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -162,6 +162,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -352,6 +353,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -609,6 +611,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -639,6 +642,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -897,6 +901,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -927,6 +932,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -1217,6 +1223,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -1426,6 +1433,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s2, -1 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 9e5a4428b011f..66d859fbd66ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; ; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS @@ -75,6 +76,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; ; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS @@ -153,6 +155,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; ; GFX1250-LABEL: local_atomic_fadd_v2f16_noret: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 @@ -220,6 +223,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX1250-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 831af7b6c10ba..d1db407cdbe07 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -42,6 +42,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -72,6 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -115,6 +117,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -161,6 +164,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -191,6 +195,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -234,6 +239,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -280,6 +286,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -310,6 +317,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -353,6 +361,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -398,6 +407,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -428,6 +438,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -471,6 +482,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -516,6 +528,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -546,6 +559,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -589,6 +603,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -635,6 +650,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -665,6 +681,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -708,6 +725,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -754,6 +772,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -784,6 +803,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -827,6 +847,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -872,6 +893,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -902,6 +924,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -945,6 +968,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -990,6 +1014,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1020,6 +1045,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1063,6 +1089,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1109,6 +1136,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1139,6 +1167,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1182,6 +1211,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1228,6 +1258,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1258,6 +1289,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1301,6 +1333,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1346,6 +1379,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace( ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1376,6 +1410,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] @@ -1419,6 +1454,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1468,6 +1504,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1510,6 +1547,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1554,6 +1592,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1596,6 +1635,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1767,6 +1807,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1811,6 +1852,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1853,6 +1895,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -1897,6 +1940,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -2070,6 +2114,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -2110,6 +2155,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c @@ -2181,6 +2227,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2217,6 +2264,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2253,6 +2301,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index c28b25c76d241..657e35a90a9a5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -199,6 +199,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -211,12 +212,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; GFX1250-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null ; GFX1250-SDAG-TRUE16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -229,18 +232,21 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; GFX1250-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 @@ -250,12 +256,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 @@ -454,6 +462,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r, ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -466,12 +475,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r, ; GFX1250-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null ; GFX1250-SDAG-TRUE16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -484,18 +495,21 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r, ; GFX1250-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 @@ -505,12 +519,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r, ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 @@ -1216,6 +1232,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1283,6 +1300,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1350,6 +1368,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1405,6 +1424,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1665,6 +1685,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1678,6 +1699,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -1685,6 +1707,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1698,6 +1721,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -1705,6 +1729,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1712,6 +1737,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -1719,6 +1745,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -1726,6 +1753,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -1950,6 +1978,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f32_to_v2f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1968,6 +1997,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f32_to_v2f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -1986,6 +2016,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f32_to_v2f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -2000,6 +2031,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f32_to_v2f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -3235,6 +3267,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -3351,6 +3384,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -3467,6 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -3566,6 +3601,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -3910,6 +3946,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -3931,6 +3968,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -3952,6 +3990,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -3961,6 +4000,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 @@ -3971,6 +4011,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -3980,6 +4021,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -4179,6 +4221,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fneg_fptrunc_f32_to_f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4192,6 +4235,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4199,6 +4243,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fneg_fptrunc_f32_to_f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4212,6 +4257,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4219,6 +4265,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fneg_fptrunc_f32_to_f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4226,8 +4273,9 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_xor_b32 s2, s2, 0x80000000 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4235,6 +4283,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fneg_fptrunc_f32_to_f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4242,8 +4291,9 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_xor_b32 s2, s2, 0x80000000 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4441,6 +4491,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fabs_fptrunc_f32_to_f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4454,6 +4505,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4461,6 +4513,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fabs_fptrunc_f32_to_f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4474,6 +4527,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4481,6 +4535,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fabs_fptrunc_f32_to_f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4488,8 +4543,9 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_bitset0_b32 s2, 31 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4497,6 +4553,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fabs_fptrunc_f32_to_f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4504,8 +4561,9 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_bitset0_b32 s2, 31 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4703,6 +4761,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4716,6 +4775,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4723,6 +4783,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX1250-SDAG-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4736,6 +4797,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null @@ -4743,6 +4805,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4750,8 +4813,9 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_bitset1_b32 s2, 31 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4759,6 +4823,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; ; GFX1250-GISEL-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -4766,8 +4831,9 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_bitset1_b32 s2, 31 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -4973,6 +5039,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -4985,6 +5052,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX1250-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -4993,6 +5061,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -5005,6 +5074,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX1250-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5013,12 +5083,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 @@ -5029,12 +5101,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 @@ -5243,6 +5317,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -5256,6 +5331,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5264,6 +5340,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -5277,6 +5354,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5285,6 +5363,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -5292,10 +5371,10 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_bitset0_b32 s2, 31 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -5303,6 +5382,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -5310,10 +5390,10 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_bitset0_b32 s2, 31 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1250-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], null @@ -5529,6 +5609,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -5541,6 +5622,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; GFX1250-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -5549,6 +5631,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -5561,6 +5644,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; GFX1250-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX1250-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -5569,12 +5653,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX1250-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-TRUE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-TRUE16-NEXT: s_sext_i32_i16 s2, s2 @@ -5585,12 +5671,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX1250-GISEL-FAKE16-NEXT: s_cvt_f16_f32 s2, s2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-FAKE16-NEXT: s_sext_i32_i16 s2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/global-address.ll b/llvm/test/CodeGen/AMDGPU/global-address.ll index bcded5221bf43..f3db48edbb6bb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-address.ll +++ b/llvm/test/CodeGen/AMDGPU/global-address.ll @@ -27,8 +27,10 @@ define amdgpu_kernel void @caller_internal() { ; ; GFX1250-PAL-LABEL: caller_internal: ; GFX1250-PAL: ; %bb.0: -; GFX1250-PAL-NEXT: s_mov_b64 s[0:1], internal_func@abs64 ; GFX1250-PAL-NEXT: s_mov_b32 s32, 0 +; GFX1250-PAL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-PAL-NEXT: s_mov_b64 s[0:1], internal_func@abs64 +; GFX1250-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-PAL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] ; GFX1250-PAL-NEXT: s_endpgm ; @@ -45,9 +47,10 @@ define amdgpu_kernel void @caller_internal() { ; ; GFX1250-HSA-LABEL: caller_internal: ; GFX1250-HSA: ; %bb.0: +; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 +; GFX1250-HSA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-HSA-NEXT: s_get_pc_i64 s[0:1] ; GFX1250-HSA-NEXT: s_add_nc_u64 s[0:1], s[0:1], internal_func@gotpcrel+4 -; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 ; GFX1250-HSA-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX1250-HSA-NEXT: s_wait_kmcnt 0x0 ; GFX1250-HSA-NEXT: s_swap_pc_i64 s[30:31], s[0:1] @@ -91,13 +94,14 @@ define amdgpu_kernel void @caller_exterinal() { ; ; GFX1250-PAL-LABEL: caller_exterinal: ; GFX1250-PAL: ; %bb.0: +; GFX1250-PAL-NEXT: s_mov_b32 s32, 0 +; GFX1250-PAL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-PAL-NEXT: v_mov_b32_e32 v31, v0 ; GFX1250-PAL-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1250-PAL-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX1250-PAL-NEXT: s_mov_b64 s[12:13], external_func@abs64 ; GFX1250-PAL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1250-PAL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1250-PAL-NEXT: s_mov_b32 s32, 0 ; GFX1250-PAL-NEXT: s_swap_pc_i64 s[30:31], s[12:13] ; GFX1250-PAL-NEXT: s_endpgm ; @@ -120,6 +124,8 @@ define amdgpu_kernel void @caller_exterinal() { ; ; GFX1250-HSA-LABEL: caller_exterinal: ; GFX1250-HSA: ; %bb.0: +; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 +; GFX1250-HSA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-HSA-NEXT: v_mov_b32_e32 v31, v0 ; GFX1250-HSA-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1250-HSA-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -127,7 +133,6 @@ define amdgpu_kernel void @caller_exterinal() { ; GFX1250-HSA-NEXT: s_add_nc_u64 s[12:13], s[12:13], external_func@rel64+4 ; GFX1250-HSA-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1250-HSA-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 ; GFX1250-HSA-NEXT: s_swap_pc_i64 s[30:31], s[12:13] ; GFX1250-HSA-NEXT: s_endpgm call void @external_func() diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index e59af2e82671e..4a4fbadc41e46 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -404,6 +404,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2, ptr addrspace(1) %out) { ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GCN-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 @@ -431,6 +432,7 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GCN-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index b81fdd36530da..277aac55f7933 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -65,6 +65,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: s_insertelement_v2bf16_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -139,6 +140,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: s_insertelement_v2bf16_1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -222,6 +224,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v2bf16_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -308,6 +311,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; ; GFX1250-LABEL: v_insertelement_v2bf16_0_inlineimm: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -394,6 +398,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v2bf16_1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -480,6 +485,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; ; GFX1250-LABEL: v_insertelement_v2bf16_1_inlineimm: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -591,6 +597,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; ; GFX1250-LABEL: v_insertelement_v2bf16_dynamic_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 @@ -693,6 +700,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v4bf16_0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -787,6 +795,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v4bf16_1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -883,6 +892,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v4bf16_2: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -977,6 +987,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v4bf16_3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -1094,6 +1105,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; ; GFX1250-LABEL: v_insertelement_v4bf16_dynamic_sgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -1194,6 +1206,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; ; GFX1250-LABEL: v_insertelement_v8bf16_3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -1423,6 +1436,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; ; GFX1250-LABEL: v_insertelement_v8bf16_dynamic: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -1565,6 +1579,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; ; GFX1250-LABEL: v_insertelement_v16bf16_3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -1958,6 +1973,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX1250-LABEL: v_insertelement_v16bf16_dynamic: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll index 20b876836082e..3b1b0f5ac9897 100644 --- a/llvm/test/CodeGen/AMDGPU/literal64.ll +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -5,6 +5,7 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { ; GCN-LABEL: s_add_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xf12345678 ; GCN-NEXT: ; return to shader part epilog %result = add i64 %a, 64729929336 @@ -14,6 +15,7 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_add_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm @@ -25,6 +27,7 @@ define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { ; GCN-LABEL: s_add_neg_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988 ; GCN-NEXT: ; return to shader part epilog %result = sub i64 %a, 64729929336 @@ -34,6 +37,7 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_add_neg_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xfffffff0edcba988, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm @@ -45,6 +49,7 @@ define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { ; GCN-LABEL: s_sub_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1] ; GCN-NEXT: ; return to shader part epilog %result = sub i64 64729929336, %a @@ -54,6 +59,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_sub_u64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm @@ -140,6 +146,7 @@ define double @rsq_f64() { define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { ; GCN-LABEL: s_and_b64: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], 0xf12345678 ; GCN-NEXT: ; return to shader part epilog %result = and i64 %a, 64729929336 @@ -151,6 +158,7 @@ define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { ; GCN-SDAG-LABEL: v_and_b64: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x12345678, v0 ; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -158,6 +166,7 @@ define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { ; ; GCN-GISEL-LABEL: v_and_b64: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x12345678, v0 ; GCN-GISEL-NEXT: v_and_b32_e32 v1, 15, v1 ; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -170,6 +179,8 @@ define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { ; GCN-LABEL: v_add_f64_200.1: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-NEXT: v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1] ; GCN-NEXT: ; return to shader part epilog %add = fadd double %a, 200.1 @@ -182,6 +193,8 @@ define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { ; GCN-LABEL: v_add_f64_200.0: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-NEXT: v_add_f64_e32 v[0:1], 0x40690000, v[0:1] ; GCN-NEXT: ; return to shader part epilog %add = fadd double %a, 200.0 @@ -194,6 +207,7 @@ define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { ; GCN-SDAG-LABEL: v_lshl_add_u64: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0xf12345678 ; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1] @@ -201,6 +215,7 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { ; ; GCN-GISEL-LABEL: v_lshl_add_u64: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3] @@ -216,6 +231,7 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { ; GCN-SDAG-LABEL: v_fma_f64: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 ; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -227,6 +243,7 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { ; ; GCN-GISEL-LABEL: v_fma_f64: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], 0x4063233333333333 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] @@ -246,15 +263,18 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { ; GCN-SDAG-LABEL: v_add_neg_f64: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0x4069033333333333 -; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] ; GCN-SDAG-NEXT: ; return to shader part epilog ; ; GCN-GISEL-LABEL: v_add_neg_f64: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] ; GCN-GISEL-NEXT: ; return to shader part epilog @@ -267,6 +287,7 @@ define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { define amdgpu_ps <2 x float> @v_cndmask(double %a) { ; GCN-SDAG-LABEL: v_cndmask: ; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-SDAG-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] ; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x40632000 ; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo @@ -276,6 +297,7 @@ define amdgpu_ps <2 x float> @v_cndmask(double %a) { ; ; GCN-GISEL-LABEL: v_cndmask: ; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-GISEL-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] ; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x40690333 ; GCN-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll index c985e76422e97..5b32e1d75cfeb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll @@ -10,20 +10,32 @@ declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i32) declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i32) define amdgpu_ps float @bitop3_b32_vvv(i32 %a, i32 %b, i32 %c) { -; GCN-LABEL: bitop3_b32_vvv: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: bitop3_b32_vvv: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: bitop3_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf +; GFX1250-NEXT: ; return to shader part epilog %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 15) %ret_cast = bitcast i32 %ret to float ret float %ret_cast } define amdgpu_ps float @bitop3_b32_svv(i32 inreg %a, i32 %b, i32 %c) { -; GCN-LABEL: bitop3_b32_svv: -; GCN: ; %bb.0: -; GCN-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10 -; GCN-NEXT: ; return to shader part epilog +; GFX950-LABEL: bitop3_b32_svv: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10 +; GFX950-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: bitop3_b32_svv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10 +; GFX1250-NEXT: ; return to shader part epilog %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 16) %ret_cast = bitcast i32 %ret to float ret float %ret_cast @@ -38,6 +50,7 @@ define amdgpu_ps float @bitop3_b32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; ; GFX1250-LABEL: bitop3_b32_ssv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_bitop3_b32 v0, s0, s1, v0 bitop3:0x11 ; GFX1250-NEXT: ; return to shader part epilog %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 17) @@ -55,6 +68,7 @@ define amdgpu_ps float @bitop3_b32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) ; ; GFX1250-LABEL: bitop3_b32_sss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_bitop3_b32 v0, s0, s1, v0 bitop3:0x12 @@ -79,6 +93,7 @@ define amdgpu_ps float @bitop3_b32_vvi(i32 %a, i32 %b) { ; ; GFX1250-LABEL: bitop3_b32_vvi: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_bitop3_b32 v0, v0, v1, 0x3e8 bitop3:0x13 ; GFX1250-NEXT: ; return to shader part epilog %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 1000, i32 19) @@ -103,6 +118,7 @@ define amdgpu_ps float @bitop3_b32_vii(i32 %a) { ; ; GFX1250-SDAG-LABEL: bitop3_b32_vii: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x7d0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_bitop3_b32 v0, v0, s0, 0x3e8 bitop3:0x14 @@ -110,6 +126,7 @@ define amdgpu_ps float @bitop3_b32_vii(i32 %a) { ; ; GFX1250-GISEL-LABEL: bitop3_b32_vii: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e8 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_bitop3_b32 v0, v0, 0x7d0, v1 bitop3:0x14 @@ -140,6 +157,7 @@ define amdgpu_ps float @bitop3_b32_iii() { ; ; GFX1250-SDAG-LABEL: bitop3_b32_iii: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0x3e8 ; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0xbb8 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -148,6 +166,7 @@ define amdgpu_ps float @bitop3_b32_iii() { ; ; GFX1250-GISEL-LABEL: bitop3_b32_iii: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0x7d0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e8 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -166,11 +185,13 @@ define amdgpu_ps half @bitop3_b16_vvv(i16 %a, i16 %b, i16 %c) { ; ; GFX1250-TRUE16-LABEL: bitop3_b16_vvv: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:0xf ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: bitop3_b16_vvv: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:0xf ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 15) @@ -186,11 +207,13 @@ define amdgpu_ps half @bitop3_b16_svv(i16 inreg %a, i16 %b, i16 %c) { ; ; GFX1250-TRUE16-LABEL: bitop3_b16_svv: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, s0, v0.l, v1.l bitop3:0x10 ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: bitop3_b16_svv: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, s0, v0, v1 bitop3:0x10 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 16) @@ -207,11 +230,13 @@ define amdgpu_ps half @bitop3_b16_ssv(i16 inreg %a, i16 inreg %b, i16 %c) { ; ; GFX1250-TRUE16-LABEL: bitop3_b16_ssv: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, s0, s1, v0.l bitop3:0x11 ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: bitop3_b16_ssv: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, s0, s1, v0 bitop3:0x11 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 17) @@ -229,6 +254,7 @@ define amdgpu_ps half @bitop3_b16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c) ; ; GFX1250-TRUE16-LABEL: bitop3_b16_sss: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, s0, s1, v0.l bitop3:0x12 @@ -236,6 +262,7 @@ define amdgpu_ps half @bitop3_b16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c) ; ; GFX1250-FAKE16-LABEL: bitop3_b16_sss: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, s0, s1, v0 bitop3:0x12 @@ -260,11 +287,13 @@ define amdgpu_ps half @bitop3_b16_vvi(i16 %a, i16 %b) { ; ; GFX1250-TRUE16-LABEL: bitop3_b16_vvi: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0x3e8 bitop3:0x13 ; GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-FAKE16-LABEL: bitop3_b16_vvi: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 0x3e8 bitop3:0x13 ; GFX1250-FAKE16-NEXT: ; return to shader part epilog %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 1000, i32 19) @@ -289,6 +318,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) { ; ; GFX1250-SDG-TRUE16-LABEL: bitop3_b16_vii: ; GFX1250-SDG-TRUE16: ; %bb.0: +; GFX1250-SDG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0x7d0 ; GFX1250-SDG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDG-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0x3e8 bitop3:0x14 @@ -296,6 +326,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) { ; ; GFX1250-SDG-FAKE16-LABEL: bitop3_b16_vii: ; GFX1250-SDG-FAKE16: ; %bb.0: +; GFX1250-SDG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDG-FAKE16-NEXT: s_movk_i32 s0, 0x7d0 ; GFX1250-SDG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDG-FAKE16-NEXT: v_bitop3_b16 v0, v0, s0, 0x3e8 bitop3:0x14 @@ -303,6 +334,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) { ; ; GFX1250-GISEL-TRUE16-LABEL: bitop3_b16_vii: ; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x3e8 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, 0x7d0, v0.h bitop3:0x14 @@ -310,6 +342,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: bitop3_b16_vii: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x3e8 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_bitop3_b16 v0, v0, 0x7d0, v1 bitop3:0x14 @@ -339,6 +372,7 @@ define amdgpu_ps half @bitop3_b16_iii() { ; ; GFX1250-SDG-TRUE16-LABEL: bitop3_b16_iii: ; GFX1250-SDG-TRUE16: ; %bb.0: +; GFX1250-SDG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7d0 ; GFX1250-SDG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0xbb8 ; GFX1250-SDG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -347,6 +381,7 @@ define amdgpu_ps half @bitop3_b16_iii() { ; ; GFX1250-SDG-FAKE16-LABEL: bitop3_b16_iii: ; GFX1250-SDG-FAKE16: ; %bb.0: +; GFX1250-SDG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDG-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3e8 ; GFX1250-SDG-FAKE16-NEXT: s_movk_i32 s0, 0xbb8 ; GFX1250-SDG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -355,6 +390,7 @@ define amdgpu_ps half @bitop3_b16_iii() { ; ; GFX1250-GISEL-TRUE16-LABEL: bitop3_b16_iii: ; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7d0 ; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x3e8 ; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -363,6 +399,7 @@ define amdgpu_ps half @bitop3_b16_iii() { ; ; GFX1250-GISEL-FAKE16-LABEL: bitop3_b16_iii: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7d0 ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x3e8 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -372,3 +409,5 @@ define amdgpu_ps half @bitop3_b16_iii() { %ret_cast = bitcast i16 %ret to half ret half %ret_cast } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll index fa97380583798..aa762c8275fdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll @@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.id.z() #0 define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -87,6 +88,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -95,6 +97,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -171,6 +174,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -184,6 +188,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -260,6 +265,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -268,6 +274,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -344,6 +351,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -357,6 +365,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: s_wait_xcnt 0x0 ; CHECK-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 @@ -436,6 +445,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-MESA3D-NEXT: s_lshr_b32 s2, ttmp7, 16 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -446,6 +456,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_wait_xcnt 0x0 ; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 @@ -525,6 +536,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; CHECK-G-MESA3D-NEXT: s_lshr_b32 s2, ttmp7, 16 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll index 3019c1d897d98..7f4d35916668d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll @@ -10,6 +10,7 @@ declare void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gaddr define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 @@ -18,6 +19,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gadd ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -34,6 +36,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 m0, 15 ; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16 @@ -41,6 +44,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr_imm_mask(ptr addrspace ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -56,6 +60,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_async_to_lds_b8_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b8 v0, v1, s[0:1] offset:16 @@ -69,6 +74,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 @@ -77,6 +83,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gad ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -93,6 +100,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 m0, 15 ; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16 @@ -100,6 +108,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr_imm_mask(ptr addrspac ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -115,6 +124,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr( ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 @@ -128,6 +138,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 @@ -136,6 +147,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gad ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -152,6 +164,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr_imm_mask( ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: s_movk_i32 m0, 0x7f ; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16 @@ -159,6 +172,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr_imm_mask( ptr addrspa ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -174,6 +188,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 @@ -187,6 +202,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-SDAG-NEXT: s_mov_b32 m0, s0 @@ -195,6 +211,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr(ptr addrspace(1) %ga ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -211,6 +228,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: s_movk_i32 m0, 0x7f ; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16 @@ -218,6 +236,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr_imm_mask(ptr addrspa ; ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -233,6 +252,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_async_to_lds_b128_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b128 v0, v1, s[0:1] offset:16 @@ -246,6 +266,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { ; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_endpgm @@ -259,6 +280,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_endpgm @@ -272,6 +294,7 @@ entry: define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll index 7746dc60ddfc3..27efce3d35089 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll @@ -9,6 +9,7 @@ declare <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1), i32 define amdgpu_ps void @cluster_load_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { ; GFX1250-LABEL: cluster_load_b32_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-NEXT: s_mov_b32 m0, s0 ; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT @@ -25,6 +26,7 @@ entry: define amdgpu_ps void @cluster_load_b32_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: cluster_load_b32_vaddr_imm_mask: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, 7 ; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -40,6 +42,7 @@ entry: define amdgpu_ps void @cluster_load_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_b32_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV @@ -56,6 +59,7 @@ entry: define amdgpu_ps void @cluster_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) { ; GFX1250-LABEL: cluster_load_monitor_b32_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -72,6 +76,7 @@ entry: define amdgpu_ps void @cluster_load_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { ; GFX1250-LABEL: cluster_load_b64_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-NEXT: s_mov_b32 m0, s0 ; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT @@ -88,6 +93,7 @@ entry: define amdgpu_ps void @cluster_load_b64_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: cluster_load_b64_vaddr_imm_mask: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, 0x10007 ; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -103,6 +109,7 @@ entry: define amdgpu_ps void @cluster_load_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV @@ -119,6 +126,7 @@ entry: define amdgpu_ps void @cluster_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) { ; GFX1250-LABEL: cluster_load_monitor_b64_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -135,6 +143,7 @@ entry: define amdgpu_ps void @cluster_load_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) { ; GFX1250-LABEL: cluster_load_b128_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-NEXT: s_mov_b32 m0, s0 ; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_NT @@ -151,6 +160,7 @@ entry: define amdgpu_ps void @cluster_load_b128_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: cluster_load_b128_vaddr_imm_mask: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 m0, 15 ; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -166,6 +176,7 @@ entry: define amdgpu_ps void @cluster_load_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) { ; GFX1250-LABEL: cluster_load_b128_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_mov_b32 m0, s2 ; GFX1250-NEXT: cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll index 3ef84a3943d14..b16fa1e5f5831 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll @@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.id.z() #0 define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_x: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_and_b32 s2, ttmp6, 15 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -89,6 +90,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_and_b32 s2, ttmp6, 15 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -99,6 +101,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_and_b32 s2, ttmp6, 15 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -177,6 +180,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_and_b32 s2, ttmp6, 15 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -192,6 +196,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1,2,2" { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_x_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -268,6 +273,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -276,6 +282,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -352,6 +359,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -365,6 +373,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_y: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -443,6 +452,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -453,6 +463,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -531,6 +542,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,6 +558,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_y_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -622,6 +635,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -630,6 +644,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -706,6 +721,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +735,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_z: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -797,6 +814,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -807,6 +825,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -885,6 +904,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40008 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -900,6 +920,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-UNKNOWN-LABEL: test_workgroup_flat_id: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -978,6 +999,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -988,6 +1010,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_flat_id: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1066,6 +1089,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4) ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1081,6 +1105,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,2,1" { ; CHECK-UNKNOWN-LABEL: test_workgroup_id_z_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -1157,6 +1182,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -1165,6 +1191,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -1241,6 +1268,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll index b8ff9e5ae0366..25e4b9887ffe3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll @@ -9,6 +9,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() #0 define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_flat_id: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -87,6 +88,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -97,6 +99,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_flat_id: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -175,6 +178,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40018 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll index 9bca696b73437..bc44168960c74 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll @@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z() #0 define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -89,6 +90,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -99,6 +101,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -177,6 +180,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -192,6 +196,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -268,6 +273,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -276,6 +282,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -352,6 +359,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -365,6 +373,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -443,6 +452,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -453,6 +463,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -531,6 +542,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -546,6 +558,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -622,6 +635,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -630,6 +644,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -706,6 +721,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -719,6 +735,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -797,6 +814,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 ; CHECK-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -807,6 +825,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -885,6 +904,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: s_bfe_u32 s2, ttmp6, 0x40014 ; CHECK-G-MESA3D-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -900,6 +920,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" { ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized: ; CHECK-UNKNOWN: ; %bb.0: +; CHECK-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -976,6 +997,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-MESA3D-NEXT: ; %bb.0: +; CHECK-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-MESA3D-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6 ; CHECK-MESA3D-NEXT: s_wait_kmcnt 0x0 @@ -984,6 +1006,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized: ; CHECK-G-UNKNOWN: ; %bb.0: +; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 @@ -1060,6 +1083,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: runtime_loader_kernel_symbol = 0 ; CHECK-G-MESA3D-NEXT: .end_amd_kernel_code_t ; CHECK-G-MESA3D-NEXT: ; %bb.0: +; CHECK-G-MESA3D-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; CHECK-G-MESA3D-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; CHECK-G-MESA3D-NEXT: v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0 ; CHECK-G-MESA3D-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll index 95653148b09f4..72611843aff95 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll @@ -12,6 +12,7 @@ declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16) define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -19,6 +20,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -26,6 +28,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -33,6 +36,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -45,6 +49,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -52,6 +57,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -59,6 +65,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -66,6 +73,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -78,6 +86,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -85,6 +94,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -92,6 +102,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -99,6 +110,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -111,6 +123,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -118,6 +131,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -125,6 +139,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -132,6 +147,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -144,12 +160,14 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 ; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 @@ -157,6 +175,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 @@ -164,6 +183,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 @@ -178,6 +198,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -185,6 +206,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -192,6 +214,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -199,6 +222,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -211,6 +235,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte1: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -218,6 +243,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte1: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -225,6 +251,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte1: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -232,6 +259,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte1: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -244,6 +272,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) { define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte2: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -251,6 +280,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte2: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -258,6 +288,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte2: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -265,6 +296,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte2: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -277,6 +309,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) { define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -284,6 +317,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) { ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -291,6 +325,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l @@ -298,6 +333,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -310,12 +346,14 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) { define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.h, v0 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 ; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 @@ -323,6 +361,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 @@ -330,6 +369,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 @@ -344,21 +384,25 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) { define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_bf8_v: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_bf8 v0, v0.l ; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_bf8_v: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_bf8 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_bf8_v: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_bf8 v0, v0.l ; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_bf8_v: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_bf8 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a) @@ -369,6 +413,7 @@ define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) { define amdgpu_ps float @test_cvt_pk_f16_bf8_s(i16 inreg %a) { ; GFX1250-LABEL: test_cvt_pk_f16_bf8_s: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_f16_bf8 v0, s0 ; GFX1250-NEXT: ; return to shader part epilog %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a) @@ -379,21 +424,25 @@ define amdgpu_ps float @test_cvt_pk_f16_bf8_s(i16 inreg %a) { define amdgpu_ps float @test_cvt_pk_f16_fp8_v(i16 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l ; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l ; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a) @@ -404,6 +453,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v(i16 %a) { define amdgpu_ps float @test_cvt_pk_f16_fp8_s(i16 inreg %a) { ; GFX1250-LABEL: test_cvt_pk_f16_fp8_s: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_pk_f16_fp8 v0, s0 ; GFX1250-NEXT: ; return to shader part epilog %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a) @@ -414,11 +464,13 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_s(i16 inreg %a) { define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.h ; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 @@ -426,6 +478,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) { ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_f16_fp8 v0, v0.l @@ -433,6 +486,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) { ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_f16_fp8 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 856290ab2e868..3372868455d65 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -10,6 +10,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { ; ; GFX1250-LABEL: test_cvt_f32_bf8_byte0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) @@ -25,6 +26,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { ; ; GFX1250-LABEL: test_cvt_f32_bf8_byte1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) @@ -40,6 +42,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { ; ; GFX1250-LABEL: test_cvt_f32_bf8_byte2: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) @@ -55,6 +58,7 @@ define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { ; ; GFX1250-LABEL: test_cvt_f32_fp8_byte3: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) @@ -71,6 +75,7 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr ; ; GFX1250-LABEL: test_cvt_pk_bf8_f32_word0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off @@ -93,6 +98,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr ; ; GFX1250-LABEL: test_cvt_pk_fp8_f32_word1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -115,6 +121,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a ; ; GFX1250-LABEL: test_cvt_sr_bf8_f32_byte0: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off @@ -135,6 +142,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a ; ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off @@ -155,6 +163,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr a ; ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte2: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll index fd51759f50d48..0dd0f7648c94a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll @@ -72,6 +72,7 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { ; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -81,6 +82,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ; ; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -90,6 +92,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ; ; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1_dpp: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -158,6 +161,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { ; GFX1250-TRUE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-TRUE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-TRUE16-NEXT: global_store_b32 v[4:5], v2, off @@ -165,6 +169,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, p ; ; GFX1250-FAKE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-FAKE16-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -172,6 +177,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, p ; ; GFX1250-GISEL-LABEL: test_cvt_sr_fp8_f32_byte1_dpp: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll index ff4756318b2dc..dea104ee0b42a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll @@ -12,6 +12,7 @@ declare i32 @llvm.amdgcn.cvt.sr.fp8.f16(half, i32, i32, i32) define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_v: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_bf8_f16 v0.l, v0 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[2:3], v0, off @@ -19,6 +20,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_v: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_bf8_f16 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off @@ -26,6 +28,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_v: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_bf8_f16 v0.l, v0 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[4:5], v0, off @@ -33,6 +36,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_v: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_bf8_f16 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[4:5], v0, off @@ -45,24 +49,28 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou define amdgpu_ps void @test_cvt_pk_bf8_f16_s(<2 x half> inreg %a, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_s: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_bf8_f16 v2.l, s0 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-REAL16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_s: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_bf8_f16 v2, s0 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_s: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_bf8_f16 v2.l, s0 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-REAL16-NEXT: s_endpgm ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_s: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_bf8_f16 v2, s0 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-FAKE16-NEXT: s_endpgm @@ -74,24 +82,28 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_s(<2 x half> inreg %a, ptr addrspace( define amdgpu_ps void @test_cvt_pk_bf8_f16_l(ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_l: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_bf8_f16 v2.l, 0x56400000 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-REAL16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_l: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_bf8_f16 v2, 0x56400000 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_l: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_bf8_f16 v2.l, 0x56400000 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-REAL16-NEXT: s_endpgm ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_l: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_bf8_f16 v2, 0x56400000 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-FAKE16-NEXT: s_endpgm @@ -103,6 +115,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_l(ptr addrspace(1) %out) { define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_v: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_fp8_f16 v0.l, v0 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[2:3], v0, off @@ -110,6 +123,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_v: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_fp8_f16 v0, v0 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off @@ -117,6 +131,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_v: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_fp8_f16 v0.l, v0 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[4:5], v0, off @@ -124,6 +139,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_v: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_fp8_f16 v0, v0 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[4:5], v0, off @@ -136,24 +152,28 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou define amdgpu_ps void @test_cvt_pk_fp8_f16_s(<2 x half> inreg %a, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_s: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_fp8_f16 v2.l, s0 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-REAL16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_s: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_fp8_f16 v2, s0 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_s: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_fp8_f16 v2.l, s0 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-REAL16-NEXT: s_endpgm ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_s: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_fp8_f16 v2, s0 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-FAKE16-NEXT: s_endpgm @@ -165,24 +185,28 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_s(<2 x half> inreg %a, ptr addrspace( define amdgpu_ps void @test_cvt_pk_fp8_f16_l(ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_l: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_pk_fp8_f16 v2.l, 0x56400000 ; GFX1250-SDAG-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-REAL16-NEXT: s_endpgm ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_l: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_pk_fp8_f16 v2, 0x56400000 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-SDAG-FAKE16-NEXT: s_endpgm ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_l: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_pk_fp8_f16 v2.l, 0x56400000 ; GFX1250-GISEL-REAL16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-REAL16-NEXT: s_endpgm ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_l: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_pk_fp8_f16 v2, 0x56400000 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX1250-GISEL-FAKE16-NEXT: s_endpgm @@ -194,6 +218,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_l(ptr addrspace(1) %out) { define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -201,6 +226,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -208,6 +234,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -215,6 +242,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -227,6 +255,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte1: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -234,6 +263,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte1: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:1 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -241,6 +271,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte1: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:1 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -248,6 +279,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte1: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:1 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -260,6 +292,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte2: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:2 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -267,6 +300,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte2: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:2 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -274,6 +308,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte2: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:2 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -281,6 +316,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte2: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:2 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -293,6 +329,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte3: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -300,6 +337,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte3: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -307,6 +345,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte3: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -314,6 +353,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte3: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -326,6 +366,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_hi_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_bf8_f16 v2, v0.h, v1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -333,6 +374,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_hi_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_lshrrev_b32 v0, 16, v0 ; GFX1250-SDAG-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -342,6 +384,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_hi_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-REAL16-NEXT: v_mov_b32_e32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -351,6 +394,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_hi_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -366,6 +410,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -373,6 +418,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -380,6 +426,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -387,6 +434,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -399,6 +447,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte1: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -406,6 +455,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte1: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:1 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -413,6 +463,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte1: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:1 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -420,6 +471,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte1: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:1 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -432,6 +484,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte2: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:2 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -439,6 +492,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte2: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:2 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -446,6 +500,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte2: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:2 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -453,6 +508,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte2: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:2 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -465,6 +521,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte3: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:3 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -472,6 +529,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte3: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:3 ; GFX1250-SDAG-FAKE16-NEXT: global_store_b32 v[4:5], v2, off @@ -479,6 +537,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte3: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:3 ; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[6:7], v2, off @@ -486,6 +545,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte3: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:3 ; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[6:7], v2, off @@ -498,6 +558,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 %old, ptr addrspace(1) %out) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_hi_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-REAL16-NEXT: v_cvt_sr_fp8_f16 v2, v0.h, v1 ; GFX1250-SDAG-REAL16-NEXT: global_store_b32 v[4:5], v2, off @@ -505,6 +566,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_hi_byte0: ; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_lshrrev_b32 v0, 16, v0 ; GFX1250-SDAG-FAKE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -514,6 +576,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_hi_byte0: ; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-REAL16-NEXT: v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-REAL16-NEXT: v_mov_b32_e32 v7, v4 ; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -523,6 +586,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 ; ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_hi_byte0: ; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-FAKE16-NEXT: v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v4 ; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll index 2179800f9d317..d6ed869d28354 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll @@ -7,6 +7,7 @@ declare <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float, float, i32) #0 define amdgpu_ps float @cvt_sr_pk_f16_f32_vvv(float %src0, float %src1, i32 %src2) #1 { ; GCN-LABEL: cvt_sr_pk_f16_f32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_f16_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 %src2) #0 @@ -17,6 +18,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvv(float %src0, float %src1, i32 %src define amdgpu_ps float @cvt_sr_pk_f16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 { ; GCN-LABEL: cvt_sr_pk_f16_f32_sss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_cvt_sr_pk_f16_f32 v0, s0, s1, v0 @@ -29,6 +31,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_sss(float inreg %src0, float inreg %sr define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi(float %src0, float %src1) #1 { ; GCN-LABEL: cvt_sr_pk_f16_f32_vvi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_f16_f32 v0, v0, v1, 0x10002 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 65538) #0 @@ -39,6 +42,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi(float %src0, float %src1) #1 { define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi_mods(float %src0, float %src1) #1 { ; GCN-LABEL: cvt_sr_pk_f16_f32_vvi_mods: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_f16_f32 v0, -v0, |v1|, 1 ; GCN-NEXT: ; return to shader part epilog %s0 = fneg float %src0 @@ -51,6 +55,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi_mods(float %src0, float %src1) #1 define amdgpu_ps float @cvt_sr_pk_f16_f32_ssi(float inreg %src0, float inreg %src1) #1 { ; GCN-LABEL: cvt_sr_pk_f16_f32_ssi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_f16_f32 v0, s0, s1, 1 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll index 559b1b171031d..05988c908cde7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll @@ -10,6 +10,7 @@ declare i16 @llvm.amdgcn.sat.pk4.u4.u8(i32) #0 define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_v: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_clause 0x1 ; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -21,6 +22,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_clause 0x1 ; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -32,6 +34,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_clause 0x1 ; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -43,6 +46,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_clause 0x1 ; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -59,6 +63,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -68,6 +73,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8 @@ -77,6 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_clause 0x1 ; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -88,6 +95,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_clause 0x1 ; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -104,6 +112,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_i: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -113,6 +122,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64 @@ -122,6 +132,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64 ; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -131,6 +142,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64 ; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -145,6 +157,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_v: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_clause 0x1 ; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -156,6 +169,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_clause 0x1 ; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -167,6 +181,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_clause 0x1 ; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -178,6 +193,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_clause 0x1 ; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -194,6 +210,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -203,6 +220,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8 @@ -212,6 +230,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_clause 0x1 ; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -223,6 +242,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_clause 0x1 ; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -239,6 +259,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 { ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_i: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -248,6 +269,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 { ; ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64 @@ -257,6 +279,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 { ; ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i: ; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64 ; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -266,6 +289,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 { ; ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i: ; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64 ; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll index 5c439f631a426..74b1818c44d45 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll @@ -21,6 +21,7 @@ declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %sc define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1 ; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -28,6 +29,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_fp8_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1 ; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -40,6 +42,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_bf8_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2 ; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -47,6 +50,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_bf8_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2 ; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -59,6 +63,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp8_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 ; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp8 v[4:7], v[0:1], v2 scale_sel:1 ; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -71,6 +76,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_bf8_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 ; GFX1250-NEXT: v_cvt_scale_pk8_bf16_bf8 v[4:7], v[0:1], v2 scale_sel:2 ; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off @@ -83,6 +89,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk8_f16_fp4_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scale_pk8_f16_fp4 v[4:7], v0, v1 scale_sel:3 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX1250-NEXT: s_endpgm @@ -94,6 +101,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr a define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp4_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp4 v[4:7], v0, v1 scale_sel:4 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX1250-NEXT: s_endpgm @@ -105,6 +113,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp8_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8 ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -114,6 +123,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp8_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8 ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -128,6 +138,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_bf8_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2 ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -137,6 +148,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_bf8_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2 ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -151,6 +163,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp4_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 @@ -159,6 +172,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp4_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -172,6 +186,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 @@ -180,6 +195,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off @@ -193,6 +209,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -204,6 +221,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -220,6 +238,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scale_pk16_bf16_fp6 v[6:13], v[0:2], v3 scale_sel:2 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 @@ -233,6 +252,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scal define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_sl: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 ; GFX1250-NEXT: v_mov_b32_e32 v12, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -249,6 +269,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 @@ -257,6 +278,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off @@ -270,6 +292,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -281,6 +304,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -297,6 +321,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[6:13], v[0:2], v3 scale_sel:6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 @@ -310,6 +335,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scal define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_sl: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 ; GFX1250-NEXT: v_mov_b32_e32 v12, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -326,6 +352,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_fp6_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5 ; GFX1250-SDAG-NEXT: s_clause 0x3 ; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 @@ -336,6 +363,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_fp6_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5 ; GFX1250-GISEL-NEXT: s_clause 0x3 ; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off @@ -351,6 +379,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_bf6_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6 ; GFX1250-SDAG-NEXT: s_clause 0x3 ; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 @@ -361,6 +390,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale ; ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_bf6_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6 ; GFX1250-GISEL-NEXT: s_clause 0x3 ; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll index dfb908930750f..ef3147ec5c5a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll @@ -13,6 +13,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s ; GFX1210-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 ; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off @@ -20,6 +21,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 ; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off @@ -32,6 +34,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -47,6 +50,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, pt ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] @@ -67,6 +71,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, pt define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 ; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off @@ -74,6 +79,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %s ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 ; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off @@ -86,6 +92,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -101,6 +108,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, pt ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] @@ -121,6 +129,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, pt define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -128,6 +137,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -140,6 +150,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -151,6 +162,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -167,6 +179,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -174,6 +187,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %sc ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -186,6 +200,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %sc define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -197,6 +212,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -213,6 +229,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -220,6 +237,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -232,6 +250,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -243,6 +262,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -259,6 +279,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -266,6 +287,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %sc ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off @@ -278,6 +300,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %sc define amdgpu_ps void @test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -289,6 +312,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll index cd0b081bf6f10..6839975c3270c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll @@ -15,6 +15,7 @@ declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %sca define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -22,6 +23,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %s ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -34,6 +36,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %s define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -43,6 +46,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, pt ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -57,6 +61,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, pt define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -64,6 +69,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %s ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -76,6 +82,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %s define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -85,6 +92,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, pt ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -99,6 +107,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, pt define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[8:9], v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -106,6 +115,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scal ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off @@ -118,6 +128,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scal define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -127,6 +138,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr a ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -141,6 +153,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr a define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[8:9], v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off @@ -148,6 +161,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scal ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off @@ -160,6 +174,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scal define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -169,6 +184,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr a ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -183,6 +199,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr a define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[12:13], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off @@ -190,6 +207,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %sca ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off @@ -202,6 +220,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %sca define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -213,6 +232,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -229,6 +249,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[12:13], v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off @@ -236,6 +257,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %sca ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off @@ -248,6 +270,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %sca define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -259,6 +282,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -275,6 +299,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8 ; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v9, off @@ -282,6 +307,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %sca ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8 ; GFX1250-GISEL-NEXT: global_store_b32 v[12:13], v9, off @@ -294,6 +320,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %sca define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -305,6 +332,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -321,6 +349,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off @@ -328,6 +357,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scal ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b32 v[8:9], v5, off @@ -340,6 +370,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scal define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -349,6 +380,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr a ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -363,6 +395,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr a define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4 ; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off @@ -370,6 +403,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %s ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4 ; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v5, off @@ -382,6 +416,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %s define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -391,6 +426,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, pt ; ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll index d33acf6ca7f76..b4789541044b1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll @@ -15,12 +15,14 @@ declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -32,6 +34,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 % define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -41,6 +44,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -55,12 +59,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -72,6 +78,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 % define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -81,6 +88,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -95,12 +103,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -112,6 +122,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr, define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -121,6 +132,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i3 ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -135,12 +147,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i3 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -152,6 +166,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr, define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -161,6 +176,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i3 ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -175,12 +191,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i3 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 ; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 ; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -192,6 +210,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -203,6 +222,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -219,12 +239,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 ; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 ; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -236,6 +258,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -247,6 +270,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -263,12 +287,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 ; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v12, off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 ; GFX1250-GISEL-NEXT: global_store_b32 v[10:11], v12, off ; GFX1250-GISEL-NEXT: s_endpgm @@ -280,6 +306,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -291,6 +318,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -307,12 +335,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off ; GFX1250-GISEL-NEXT: s_endpgm @@ -324,6 +354,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr, define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -333,6 +364,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i3 ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -347,12 +379,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i3 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 ; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 ; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off ; GFX1250-GISEL-NEXT: s_endpgm @@ -364,6 +398,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 % define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -373,6 +408,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll index c4395182d6719..0e20fc69886db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll @@ -12,6 +12,7 @@ declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9 ; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off ; GFX1250-NEXT: s_endpgm @@ -23,6 +24,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -39,6 +41,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %sr define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9 ; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off ; GFX1250-NEXT: s_endpgm @@ -50,6 +53,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %s define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -61,6 +65,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -77,6 +82,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9 ; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off ; GFX1250-NEXT: s_endpgm @@ -88,6 +94,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -104,6 +111,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %sr define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9 ; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off ; GFX1250-NEXT: s_endpgm @@ -115,6 +123,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %s define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -126,6 +135,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f16_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] @@ -142,6 +152,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f32_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f32 v[20:22], v[0:15], v16, v17 ; GFX1250-NEXT: global_store_b96 v[18:19], v[20:22], off ; GFX1250-NEXT: s_endpgm @@ -153,6 +164,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_vv(<16 x float> %src, i32 % define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -168,6 +180,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] @@ -188,6 +201,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src, define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f32_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f32 v[20:22], v[0:15], v16, v17 ; GFX1250-NEXT: global_store_b96 v[18:19], v[20:22], off ; GFX1250-NEXT: s_endpgm @@ -199,6 +213,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_vv(<16 x float> %src, i32 % define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f32_sl: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 @@ -214,6 +229,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_sl(<16 x float> inreg %src, ; ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f32_sl: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll index 82991aed706a1..1fb3b1a97f0e4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll @@ -9,6 +9,7 @@ declare <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float, float, i32) #0 define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvv(float %src0, float %src1, i32 %src2) #1 { ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvv: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 %src2) #0 @@ -19,6 +20,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvv(float %src0, float %src1, i32 %sr define amdgpu_ps float @cvt_sr_pk_bf16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 { ; GCN-LABEL: cvt_sr_pk_bf16_f32_sss: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, s0, s1, v0 @@ -31,6 +33,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_sss(float inreg %src0, float inreg %s define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi(float %src0, float %src1) #1 { ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, v0, v1, 0x10002 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 65538) #0 @@ -41,6 +44,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi(float %src0, float %src1) #1 { define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi_mods(float %src0, float %src1) #1 { ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi_mods: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, -v0, |v1|, 1 ; GCN-NEXT: ; return to shader part epilog %s0 = fneg float %src0 @@ -53,6 +57,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi_mods(float %src0, float %src1) #1 define amdgpu_ps float @cvt_sr_pk_bf16_f32_ssi(float inreg %src0, float inreg %src1) #1 { ; GCN-LABEL: cvt_sr_pk_bf16_f32_ssi: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, s0, s1, 1 ; GCN-NEXT: ; return to shader part epilog %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll index d5fba2df0b828..8f4a473a2d5ea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll @@ -7,6 +7,7 @@ declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col) define amdgpu_ps void @flat_prefetch(ptr %ptr) { ; GCN-LABEL: flat_prefetch: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -17,6 +18,7 @@ entry: define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) { ; GCN-LABEL: flat_prefetch_sgpr: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] ; GCN-NEXT: s_endpgm @@ -28,6 +30,7 @@ entry: define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) { ; GCN-LABEL: flat_prefetch_offset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512 ; GCN-NEXT: s_endpgm entry: @@ -39,6 +42,7 @@ entry: define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) { ; GCN-LABEL: flat_prefetch_sgpr_voffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] ; GCN-NEXT: s_endpgm entry: @@ -50,6 +54,7 @@ entry: define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) { ; GCN-LABEL: flat_prefetch_sgpr_voffset_offset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 ; GCN-NEXT: s_endpgm entry: @@ -62,6 +67,7 @@ entry: define amdgpu_ps void @flat_prefetch_se(ptr %ptr) { ; GCN-LABEL: flat_prefetch_se: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: @@ -72,6 +78,7 @@ entry: define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) { ; GCN-LABEL: flat_prefetch_se_nt: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: @@ -82,6 +89,7 @@ entry: define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) { ; GCN-LABEL: flat_prefetch_dev_ht: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV ; GCN-NEXT: s_endpgm entry: @@ -92,6 +100,7 @@ entry: define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) { ; GCN-LABEL: flat_prefetch_sys_lu: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll index d5fae1e4a9657..bf7cce9877f86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll @@ -10,12 +10,14 @@ declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr, define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -30,6 +32,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_load_async_to_lds_b8_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -42,12 +45,14 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -62,6 +67,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -74,12 +80,14 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -94,6 +102,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -106,12 +115,14 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -126,6 +137,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_load_async_to_lds_b128_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -138,6 +150,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_endpgm entry: @@ -150,6 +163,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_endpgm entry: @@ -162,6 +176,7 @@ entry: define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll index 80f9eeb25ebc0..25989990c50de 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll @@ -7,6 +7,7 @@ declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col) define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off ; GCN-NEXT: s_endpgm entry: @@ -17,6 +18,7 @@ entry: define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) { ; GCN-LABEL: global_prefetch_sgpr: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_prefetch_b8 v0, s[0:1] ; GCN-NEXT: s_endpgm @@ -28,6 +30,7 @@ entry: define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch_offset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512 ; GCN-NEXT: s_endpgm entry: @@ -39,6 +42,7 @@ entry: define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) { ; GCN-LABEL: global_prefetch_sgpr_voffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v0, s[0:1] ; GCN-NEXT: s_endpgm entry: @@ -50,6 +54,7 @@ entry: define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) { ; GCN-LABEL: global_prefetch_sgpr_voffset_offset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 ; GCN-NEXT: s_endpgm entry: @@ -62,6 +67,7 @@ entry: define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch_se: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: @@ -72,6 +78,7 @@ entry: define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch_se_nt: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: @@ -82,6 +89,7 @@ entry: define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch_dev_ht: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV ; GCN-NEXT: s_endpgm entry: @@ -92,6 +100,7 @@ entry: define amdgpu_ps void @global_prefetch_sys_lu(ptr addrspace(1) %ptr) { ; GCN-LABEL: global_prefetch_sys_lu: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll index 22563f8e5ff46..2e964fd8fb2c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll @@ -10,12 +10,14 @@ declare void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gad define amdgpu_ps void @global_store_async_from_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b8_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b8_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -30,6 +32,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_store_async_from_lds_b8_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_store_async_from_lds_b8 v1, v0, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -42,12 +45,14 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b32: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b32: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -62,6 +67,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_store_async_from_lds_b32_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -74,12 +80,14 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -94,6 +102,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -106,12 +115,14 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b128_vaddr: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1] ; GFX1250-SDAG-NEXT: global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b128_vaddr: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -126,6 +137,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; GFX1250-LABEL: global_store_async_from_lds_b128_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 32 ; GFX1250-NEXT: global_store_async_from_lds_b128 v1, v0, s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm @@ -138,6 +150,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_store_async_from_lds_b32_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT ; GFX1250-NEXT: s_endpgm entry: @@ -150,6 +163,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT ; GFX1250-NEXT: s_endpgm entry: @@ -162,6 +176,7 @@ entry: define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 5b36d4cefa2e3..7de4aa95b3cc2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -61,6 +61,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX1250-LABEL: is_private_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -187,6 +188,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX1250-SDAG-LABEL: is_private_sgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, src_flat_scratch_base_hi @@ -264,6 +266,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; ; GFX1250-GISEL-LABEL: is_private_sgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, src_flat_scratch_base_hi diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 7aecae901becf..3aa1a10ffc918 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -94,6 +94,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; ; GFX1250-LABEL: is_local_vgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base @@ -253,6 +254,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-SDAG-LABEL: is_local_sgpr: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -330,6 +332,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; ; GFX1250-GISEL-LABEL: is_local_sgpr: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll index 350d468344f65..910c55a041ede 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll @@ -12,6 +12,7 @@ declare <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr, i32) define amdgpu_ps void @global_load_monitor_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b32_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_monitor_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v[2:3], v0, off @@ -26,6 +27,7 @@ entry: define amdgpu_ps void @global_load_monitor_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b32_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -41,6 +43,7 @@ entry: define amdgpu_ps void @global_load_monitor_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b64_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_monitor_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -55,6 +58,7 @@ entry: define amdgpu_ps void @global_load_monitor_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -70,6 +74,7 @@ entry: define amdgpu_ps void @global_load_monitor_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b128_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_monitor_b128 v[4:7], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -84,6 +89,7 @@ entry: define amdgpu_ps void @global_load_monitor_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_monitor_b128_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_monitor_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -99,6 +105,7 @@ entry: define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: flat_load_monitor_b32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_monitor_b32 v0, v[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v[2:3], v0, off @@ -113,6 +120,7 @@ entry: define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: flat_load_monitor_b64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_monitor_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -127,6 +135,7 @@ entry: define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: flat_load_monitor_b128: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_load_monitor_b128 v[4:7], v[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -141,6 +150,7 @@ entry: define amdgpu_ps void @global_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { ; GFX1250-LABEL: global_load_monitor_b32_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off @@ -156,6 +166,7 @@ entry: define amdgpu_ps void @global_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { ; GFX1250-LABEL: global_load_monitor_b64_saddr_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_NT ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -171,6 +182,7 @@ entry: define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { ; GFX1250-LABEL: global_load_monitor_b64_saddr_no_scale_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll index f7ed5341141d4..68e9e2b1bdf5b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll @@ -20,6 +20,7 @@ declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3)) define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr4_b64_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr4_b64 v[0:1], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -34,6 +35,7 @@ entry: define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr4_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -49,6 +51,7 @@ entry: define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr8_b64_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr8_b64 v[0:1], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off @@ -63,6 +66,7 @@ entry: define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr8_b64_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -78,6 +82,7 @@ entry: define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr6_b96_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr6_b96 v[4:6], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b96 v[2:3], v[4:6], off @@ -92,6 +97,7 @@ entry: define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr6_b96_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -107,6 +113,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8i16_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -121,6 +128,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -136,6 +144,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -150,6 +159,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -165,6 +175,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8b16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8b16_vaddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off @@ -179,6 +190,7 @@ entry: define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -194,6 +206,7 @@ entry: define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-SDAG-LABEL: ds_load_tr4_b64: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -202,6 +215,7 @@ define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) ; ; GFX1250-GISEL-LABEL: ds_load_tr4_b64: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -217,6 +231,7 @@ entry: define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-SDAG-LABEL: ds_load_tr8_b64: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -225,6 +240,7 @@ define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) ; ; GFX1250-GISEL-LABEL: ds_load_tr8_b64: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -240,6 +256,7 @@ entry: define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-SDAG-LABEL: ds_load_tr6_b96: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX1250-SDAG-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -248,6 +265,7 @@ define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) ; ; GFX1250-GISEL-LABEL: ds_load_tr6_b96: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -263,6 +281,7 @@ entry: define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -271,6 +290,7 @@ define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrs ; ; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -286,6 +306,7 @@ entry: define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -294,6 +315,7 @@ define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrs ; ; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 @@ -309,6 +331,7 @@ entry: define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX1250-LABEL: ds_load_tr16_b128_v8bf16: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX1250-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll index 42a50bb304bc9..1095c39d257f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll @@ -5,6 +5,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_bcast_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 @@ -22,6 +23,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_bcast_b32_vii: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -37,6 +39,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_bcast_b32_vll: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -53,6 +56,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -69,6 +73,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 % ; ; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -92,6 +97,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_down_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 @@ -109,6 +115,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_down_b32_vii: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -124,6 +131,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_down_b32_vll: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -140,6 +148,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -156,6 +165,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s ; ; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -179,6 +189,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_up_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 @@ -196,6 +207,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_up_b32_vii: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -211,6 +223,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_up_b32_vll: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -227,6 +240,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -243,6 +257,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -266,6 +281,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_xor_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 @@ -283,6 +299,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_xor_b32_vii: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -298,6 +315,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_xor_b32_vll: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -314,6 +332,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -330,6 +349,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %sr ; ; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 @@ -353,6 +373,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -368,6 +389,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -383,6 +405,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 @@ -398,6 +421,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) { ; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_bfe_u32 v1, v0, 10, 10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index dc9e23b0a457d..b0f1caefa5cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -31,6 +31,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -79,6 +80,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -126,6 +128,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -174,6 +177,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -223,6 +227,7 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_nonatomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -272,6 +277,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; ; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -292,6 +298,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i64: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 @@ -342,6 +349,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -453,6 +461,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; ; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -473,6 +482,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; ; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 @@ -495,6 +505,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { ; ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -546,6 +557,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -597,6 +609,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { ; ; GFX12-LABEL: raw_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll index 1a1a1f784464d..77fb0ed96d761 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll @@ -5,8 +5,8 @@ ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10 ; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11 -; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12 -; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12 +; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1200 +; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1250 define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) { ; GFX67-LABEL: raw_buffer_load_i8_tfe: @@ -54,16 +54,28 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_i8_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u8 v[4:5], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b8 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_i8_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u8 v[4:5], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b8 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_i8_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u8 v[4:5], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b8 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i8, i32 } %res, 0 store i8 %data, ptr addrspace(1) %data_addr @@ -118,16 +130,28 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_i16_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_i16_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b16 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_i16_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i16, i32 } %res, 0 store i16 %data, ptr addrspace(1) %data_addr @@ -182,16 +206,28 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_f16_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_f16_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b16 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_f16_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u16 v[4:5], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { half, i32 } %res, 0 store half %data, ptr addrspace(1) %data_addr @@ -246,16 +282,28 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_b32 v[4:5], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_b32 v[4:5], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b32 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_b32 v[4:5], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %res, 0 store i32 %data, ptr addrspace(1) %data_addr @@ -330,16 +378,28 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v2i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off -; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v2i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1200-NEXT: global_store_b32 v[2:3], v6, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v2i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1250-NEXT: global_store_b32 v[2:3], v6, off +; GFX1250-NEXT: s_endpgm %res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %res, 0 store <2 x i32> %data, ptr addrspace(1) %data_addr @@ -414,16 +474,28 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v2f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off -; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v2f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1200-NEXT: global_store_b32 v[2:3], v6, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v2f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: buffer_load_b64 v[4:6], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1250-NEXT: global_store_b32 v[2:3], v6, off +; GFX1250-NEXT: s_endpgm %res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %res, 0 store <2 x float> %data, ptr addrspace(1) %data_addr @@ -503,17 +575,30 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v3i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off -; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v3i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1200-NEXT: global_store_b32 v[2:3], v7, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v3i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1250-NEXT: global_store_b32 v[2:3], v7, off +; GFX1250-NEXT: s_endpgm %res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %res, 0 store <3 x i32> %data, ptr addrspace(1) %data_addr @@ -593,17 +678,30 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v3f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off -; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v3f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1200-NEXT: global_store_b32 v[2:3], v7, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v3f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-NEXT: buffer_load_b96 v[4:7], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1250-NEXT: global_store_b32 v[2:3], v7, off +; GFX1250-NEXT: s_endpgm %res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %res, 0 store <3 x float> %data, ptr addrspace(1) %data_addr @@ -670,17 +768,30 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v4i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 -; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v4i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1200-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1200-NEXT: global_store_b32 v[2:3], v8, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v4i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1250-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1250-NEXT: global_store_b32 v[2:3], v8, off +; GFX1250-NEXT: s_endpgm %res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %res, 0 store <4 x i32> %data, ptr addrspace(1) %data_addr @@ -747,17 +858,30 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_v4f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 -; GFX12-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: raw_buffer_load_v4f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1200-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1200-NEXT: global_store_b32 v[2:3], v8, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_load_v4f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1250-NEXT: buffer_load_b128 v[4:8], off, s[0:3], null tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1250-NEXT: global_store_b32 v[2:3], v8, off +; GFX1250-NEXT: s_endpgm %res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %res, 0 store <4 x float> %data, ptr addrspace(1) %data_addr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index eeea1456792af..f770133e3559f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=GFX68,VERDE %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX12 %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX1200 %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX1250 %s define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { ; GFX68-LABEL: buffer_store: @@ -20,6 +20,23 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, < ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 slc ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: s_clause 0x2 +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX1200-NEXT: buffer_store_b128 v[4:7], off, s[0:3], null th:TH_STORE_NT +; GFX1200-NEXT: buffer_store_b128 v[8:11], off, s[0:3], null th:TH_STORE_HT +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX1250-NEXT: buffer_store_b128 v[4:7], off, s[0:3], null th:TH_STORE_NT +; GFX1250-NEXT: buffer_store_b128 v[8:11], off, s[0:3], null th:TH_STORE_HT +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -37,6 +54,17 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_immoffs: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:42 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_immoffs: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:42 +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) ret void @@ -52,6 +80,17 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_ofs: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_ofs: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) ret void @@ -83,6 +122,24 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_wait: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen +; GFX1200-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], null offen +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_wait: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen +; GFX1250-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], null offen +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) @@ -100,6 +157,17 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 % ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x1: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x1: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -115,6 +183,17 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x2: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x2: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -133,6 +212,28 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x1_offen_merged_and: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4 +; GFX1200-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x1_offen_merged_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_dual_add_nc_u32 v7, 4, v0 :: v_dual_add_nc_u32 v8, 8, v0 +; GFX1250-NEXT: v_dual_add_nc_u32 v9, 12, v0 :: v_dual_add_nc_u32 v10, 16, v0 +; GFX1250-NEXT: v_dual_add_nc_u32 v11, 28, v0 :: v_dual_add_nc_u32 v0, 32, v0 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: buffer_store_b32 v1, v7, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v2, v8, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v3, v9, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v4, v10, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v5, v11, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v6, v0, s[0:3], null offen +; GFX1250-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -163,6 +264,26 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i3 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x1_offen_merged_or: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4 +; GFX1200-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x1_offen_merged_or: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b128 v[2:5], v0, s[0:3], null offen offset:4 +; GFX1250-NEXT: buffer_store_b64 v[6:7], v0, s[0:3], null offen offset:28 +; GFX1250-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -194,6 +315,29 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsr ; GFX11-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x2 +; GFX1200-NEXT: buffer_store_b64 v[1:2], v0, s[0:3], null offen offset:4 +; GFX1200-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], null offen offset:12 th:TH_STORE_NT +; GFX1200-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28 th:TH_STORE_WB +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_dual_add_nc_u32 v7, 4, v0 :: v_dual_add_nc_u32 v8, 8, v0 +; GFX1250-NEXT: v_dual_add_nc_u32 v9, 12, v0 :: v_dual_add_nc_u32 v10, 16, v0 +; GFX1250-NEXT: v_dual_add_nc_u32 v11, 28, v0 :: v_dual_add_nc_u32 v0, 32, v0 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: buffer_store_b32 v1, v7, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v2, v8, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b32 v3, v9, s[0:3], null offen th:TH_STORE_NT +; GFX1250-NEXT: buffer_store_b32 v4, v10, s[0:3], null offen th:TH_STORE_NT +; GFX1250-NEXT: buffer_store_b32 v5, v11, s[0:3], null offen th:TH_STORE_WB +; GFX1250-NEXT: buffer_store_b32 v6, v0, s[0:3], null offen th:TH_STORE_WB +; GFX1250-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -219,6 +363,22 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x2_offen_merged_and: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x2_offen_merged_and: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: v_dual_add_nc_u32 v1, 4, v0 :: v_dual_add_nc_u32 v0, 12, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b64 v[2:3], v1, s[0:3], null offen +; GFX1250-NEXT: buffer_store_b64 v[4:5], v0, s[0:3], null offen +; GFX1250-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) @@ -238,6 +398,21 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x2_offen_merged_or: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1200-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x2_offen_merged_or: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX1250-NEXT: buffer_store_b128 v[2:5], v0, s[0:3], null offen offset:4 +; GFX1250-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -259,6 +434,21 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x1_offset_merged: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1200-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x1_offset_merged: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1250-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 +; GFX1250-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -278,6 +468,17 @@ define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_x2_offset_merged: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_x2_offset_merged: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1250-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) ret void @@ -298,6 +499,23 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b32 v6, off, s[0:3], 0 slc ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_int: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: s_clause 0x2 +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX1200-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null th:TH_STORE_NT +; GFX1200-NEXT: buffer_store_b32 v6, off, s[0:3], null th:TH_STORE_HT +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_int: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX1250-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null th:TH_STORE_NT +; GFX1250-NEXT: buffer_store_b32 v6, off, s[0:3], null th:TH_STORE_HT +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -317,6 +535,19 @@ define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_byte: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1200-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_byte: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1250-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -336,6 +567,19 @@ define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_short: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1200-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_short: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1250-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -353,6 +597,17 @@ define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_f16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_f16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -379,6 +634,17 @@ define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %dat ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_v2f16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_v2f16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -407,6 +673,17 @@ define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %dat ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_v4f16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_v4f16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -422,6 +699,17 @@ define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_i16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_i16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1250-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0) @@ -446,6 +734,17 @@ define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_v2i16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_v2i16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -472,6 +771,17 @@ define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: buffer_store_v4i16: +; GFX1200: ; %bb.0: ; %main_body +; GFX1200-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: buffer_store_v4i16: +; GFX1250: ; %bb.0: ; %main_body +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen +; GFX1250-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -490,6 +800,21 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_x1_offset_merged: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1200-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_x1_offset_merged: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 +; GFX1250-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 +; GFX1250-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -520,6 +845,21 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(< ; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 offset:28 ; GFX11-NEXT: buffer_store_b32 v5, off, s[0:3], 0 offset:32 ; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x1 +; GFX1200-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE +; GFX1200-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE +; GFX1250-NEXT: buffer_store_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) @@ -530,16 +870,41 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(< } define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { -; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4 -; GFX12-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8 -; GFX12-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12 -; GFX12-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16 -; GFX12-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28 -; GFX12-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32 -; GFX12-NEXT: s_endpgm +; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_endpgm +; +; GFX1200-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_clause 0x5 +; GFX1200-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4 +; GFX1200-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8 +; GFX1200-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12 +; GFX1200-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16 +; GFX1200-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28 +; GFX1200-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32 +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4 +; GFX1250-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8 +; GFX1250-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12 +; GFX1250-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16 +; GFX1250-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28 +; GFX1250-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32 +; GFX1250-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 7e8f490e6a7f9..1f8e30152f09d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -31,6 +31,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) % ; ; GFX12-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -79,6 +80,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) % ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -126,6 +128,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -174,6 +177,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) % ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -223,6 +227,7 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; ; GFX12-LABEL: raw_nonptr_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -272,6 +277,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; ; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 @@ -292,6 +298,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 @@ -342,6 +349,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -453,6 +461,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; ; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -473,6 +482,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; ; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0 @@ -495,6 +505,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt ; ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -546,6 +557,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 @@ -597,6 +609,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) ; ; GFX12-LABEL: raw_ptr_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index ec7d7d467ffc6..5894073ea47e3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -36,6 +36,7 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d ; ; GFX12-LABEL: buffer_store_bf16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) @@ -74,6 +75,7 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf ; ; GFX12-LABEL: buffer_store_v2bf16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) @@ -116,6 +118,7 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf ; ; GFX12-LABEL: buffer_store_v4bf16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) @@ -172,6 +175,7 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf ; ; GFX12-LABEL: buffer_store_v8bf16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll index dbea8325d89f6..24dd47a978a38 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll @@ -11,6 +11,7 @@ declare bfloat @llvm.amdgcn.rcp.bf16(bfloat) #0 define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-TRUE16-LABEL: rcp_bf16: ; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 @@ -20,6 +21,7 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; ; SDAG-FAKE16-LABEL: rcp_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -34,6 +36,7 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 { ; SDAG-TRUE16-LABEL: rcp_bf16_constant_4: ; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3e80 @@ -43,6 +46,7 @@ define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: rcp_bf16_constant_4: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -56,6 +60,7 @@ define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 { ; SDAG-TRUE16-LABEL: rcp_bf16_constant_100: ; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c24 @@ -65,6 +70,7 @@ define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: rcp_bf16_constant_100: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -78,6 +84,7 @@ define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 { ; SDAG-TRUE16-LABEL: rcp_undef_bf16: ; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0 @@ -87,6 +94,7 @@ define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: rcp_undef_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll index 662dc61301032..7f98d96737f21 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll @@ -11,6 +11,7 @@ declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0 define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: rsq_bf16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 @@ -20,6 +21,7 @@ define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; ; SDAG-FAKE16-LABEL: rsq_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -34,6 +36,7 @@ define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 { define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: rsq_bf16_constant_4: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 4.0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -43,6 +46,7 @@ define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: rsq_bf16_constant_4: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 4.0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -57,6 +61,7 @@ define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: rsq_bf16_constant_100: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 0x42c8 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -66,6 +71,7 @@ define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: rsq_bf16_constant_100: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 0x42c8 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -80,10 +86,12 @@ define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @rsq_undef_bf16(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: rsq_undef_bf16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_endpgm ; ; SDAG-FAKE16-LABEL: rsq_undef_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_endpgm %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat undef) store bfloat %rsq, ptr addrspace(1) %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll index 2173d07baa57e..b0574bf021014 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @test_asynccnt() { ; GFX12-LABEL: test_asynccnt: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_wait_asynccnt 0x0 ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.s.wait.asynccnt(i16 0) @@ -14,6 +15,7 @@ define amdgpu_ps void @test_asynccnt() { define amdgpu_ps void @test_tensorcnt() { ; GFX12-LABEL: test_tensorcnt: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_wait_tensorcnt 0x0 ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.s.wait.tensorcnt(i16 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index 94210640db14e..4905c6d8aa81b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -25,6 +25,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX1250-SDAG-LABEL: test_get_doorbell: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -34,6 +35,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) { ; ; GFX1250-GISEL-LABEL: test_get_doorbell: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -66,6 +68,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX1250-SDAG-LABEL: test_get_ddid: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -75,6 +78,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { ; ; GFX1250-GISEL-LABEL: test_get_ddid: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -99,6 +103,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_get_tma: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -124,6 +129,7 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_get_realtime: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -157,6 +163,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX1250-SDAG-LABEL: test_savewave: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -166,6 +173,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { ; ; GFX1250-GISEL-LABEL: test_savewave: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -190,6 +198,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_get_tba: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -223,6 +232,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX1250-SDAG-LABEL: test_get_0_i32: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -232,6 +242,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { ; ; GFX1250-GISEL-LABEL: test_get_0_i32: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -256,6 +267,7 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_get_99999_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 @@ -281,6 +293,7 @@ define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_get_136_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index 1070e95cda783..47eafd53a9bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i ; ; GFX12-LABEL: struct_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -85,6 +86,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_const_idx: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -137,6 +139,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3 ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -191,6 +194,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -245,6 +249,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3 ; ; GFX12-LABEL: struct_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -299,6 +304,7 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 ; ; GFX12-LABEL: struct_nonatomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -354,6 +360,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; ; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_i64: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -377,6 +384,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_i64: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -433,6 +441,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 ; ; GFX12-LABEL: struct_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -559,6 +568,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; ; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -582,6 +592,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; ; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -607,6 +618,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 ; ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -664,6 +676,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 ; ; GFX12-LABEL: struct_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -721,6 +734,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i ; ; GFX12-LABEL: struct_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll index 9abbc064803da..71e51c4322056 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll @@ -5,8 +5,8 @@ ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10 ; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11 -; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12 -; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12 +; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1200 +; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1250 define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) { ; GFX67-LABEL: struct_buffer_load_i8_tfe: @@ -54,16 +54,28 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_i8_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b8 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_i8_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b8 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_i8_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b8 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i8, i32 } %res, 0 store i8 %data, ptr addrspace(1) %data_addr @@ -118,16 +130,28 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_i16_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_i16_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b16 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_i16_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i16, i32 } %res, 0 store i16 %data, ptr addrspace(1) %data_addr @@ -182,16 +206,28 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_f16_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_f16_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b16 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_f16_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { half, i32 } %res, 0 store half %data, ptr addrspace(1) %data_addr @@ -246,16 +282,28 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add ; GFX11-NEXT: global_store_b32 v[2:3], v5, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[0:1], v4, off -; GFX12-NEXT: global_store_b32 v[2:3], v5, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_mov_b32_e32 v5, v4 +; GFX1200-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b32 v[0:1], v4, off +; GFX1200-NEXT: global_store_b32 v[2:3], v5, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: global_store_b32 v[2:3], v5, off +; GFX1250-NEXT: s_endpgm %res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %res, 0 store i32 %data, ptr addrspace(1) %data_addr @@ -330,16 +378,28 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v2i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off -; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v2i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1200-NEXT: global_store_b32 v[2:3], v6, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v2i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1250-NEXT: global_store_b32 v[2:3], v6, off +; GFX1250-NEXT: s_endpgm %res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %res, 0 store <2 x i32> %data, ptr addrspace(1) %data_addr @@ -414,16 +474,28 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v6, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v2f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off -; GFX12-NEXT: global_store_b32 v[2:3], v6, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v2f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1200-NEXT: global_store_b32 v[2:3], v6, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v2f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX1250-NEXT: global_store_b32 v[2:3], v6, off +; GFX1250-NEXT: s_endpgm %res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %res, 0 store <2 x float> %data, ptr addrspace(1) %data_addr @@ -503,17 +575,30 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v3i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off -; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v3i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1200-NEXT: global_store_b32 v[2:3], v7, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v3i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1250-NEXT: global_store_b32 v[2:3], v7, off +; GFX1250-NEXT: s_endpgm %res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %res, 0 store <3 x i32> %data, ptr addrspace(1) %data_addr @@ -593,17 +678,30 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v7, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v3f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_mov_b32_e32 v7, v4 -; GFX12-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b96 v[0:1], v[4:6], off -; GFX12-NEXT: global_store_b32 v[2:3], v7, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v3f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v7, v4 +; GFX1200-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1200-NEXT: global_store_b32 v[2:3], v7, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v3f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-NEXT: buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[4:6], off +; GFX1250-NEXT: global_store_b32 v[2:3], v7, off +; GFX1250-NEXT: s_endpgm %res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %res, 0 store <3 x float> %data, ptr addrspace(1) %data_addr @@ -670,17 +768,30 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v4i32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 -; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v4i32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1200-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1200-NEXT: global_store_b32 v[2:3], v8, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v4i32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1250-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1250-NEXT: global_store_b32 v[2:3], v8, off +; GFX1250-NEXT: s_endpgm %res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %res, 0 store <4 x i32> %data, ptr addrspace(1) %data_addr @@ -747,17 +858,30 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a ; GFX11-NEXT: global_store_b32 v[2:3], v8, off ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_buffer_load_v4f32_tfe: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 -; GFX12-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: global_store_b32 v[2:3], v8, off -; GFX12-NEXT: s_endpgm +; GFX1200-LABEL: struct_buffer_load_v4f32_tfe: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: v_mov_b32_e32 v4, 0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1200-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1200-NEXT: global_store_b32 v[2:3], v8, off +; GFX1200-NEXT: s_endpgm +; +; GFX1250-LABEL: struct_buffer_load_v4f32_tfe: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX1250-NEXT: buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX1250-NEXT: global_store_b32 v[2:3], v8, off +; GFX1250-NEXT: s_endpgm %res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %res, 0 store <4 x float> %data, ptr addrspace(1) %data_addr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll index 822016b23c952..4bde9db509f13 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -26,6 +26,7 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, < ; ; GFX12-LABEL: buffer_store: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_mov_b32_e32 v12, 0 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], null idxen @@ -54,6 +55,7 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { ; ; GFX12-LABEL: buffer_store_immoffs: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen offset:42 ; GFX12-NEXT: s_endpgm @@ -75,6 +77,7 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { ; ; GFX12-LABEL: buffer_store_idx: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen ; GFX12-NEXT: s_endpgm main_body: @@ -101,6 +104,7 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { ; ; GFX12-LABEL: buffer_store_ofs: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen ; GFX12-NEXT: s_endpgm @@ -122,6 +126,7 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) ; ; GFX12-LABEL: buffer_store_both: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen ; GFX12-NEXT: s_endpgm main_body: @@ -144,6 +149,7 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, ; ; GFX12-LABEL: buffer_store_both_reversed: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v4 ; GFX12-NEXT: buffer_store_b128 v[0:3], v[6:7], s[0:3], null idxen offen ; GFX12-NEXT: s_endpgm @@ -181,6 +187,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, ; ; GFX12-LABEL: buffer_store_wait: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen ; GFX12-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], null idxen @@ -207,6 +214,7 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 % ; ; GFX12-LABEL: buffer_store_x1: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen ; GFX12-NEXT: s_endpgm main_body: @@ -227,6 +235,7 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, ; ; GFX12-LABEL: buffer_store_x2: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen ; GFX12-NEXT: s_endpgm main_body: @@ -254,6 +263,7 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i ; ; GFX12-LABEL: buffer_store_int: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], null idxen @@ -282,6 +292,7 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1 ; ; GFX12-LABEL: struct_buffer_store_byte: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX12-NEXT: buffer_store_b8 v0, v1, s[0:3], null idxen ; GFX12-NEXT: s_endpgm @@ -313,12 +324,16 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, ; ; GFX12-TRUE16-LABEL: struct_buffer_store_f16: ; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX12-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX12-TRUE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen ; GFX12-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: struct_buffer_store_f16: ; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX12-FAKE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen ; GFX12-FAKE16-NEXT: s_endpgm @@ -349,6 +364,7 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal ; ; GFX12-LABEL: struct_buffer_store_v2f16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -381,6 +397,7 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal ; ; GFX12-LABEL: struct_buffer_store_v4f16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -402,6 +419,7 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, ; ; GFX12-LABEL: struct_buffer_store_i16: ; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen ; GFX12-NEXT: s_endpgm @@ -433,6 +451,7 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16 ; ; GFX12-LABEL: struct_buffer_store_vif16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -463,6 +482,7 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16 ; ; GFX12-LABEL: struct_buffer_store_v4i16: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen ; GFX12-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 1e4b43d1f4fce..c2c8580de937c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -85,6 +86,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 @@ -137,6 +139,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8 ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_off: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -191,6 +194,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace( ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_soff: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -245,6 +249,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8 ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_dlc: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -299,6 +304,7 @@ define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) ; ; GFX12-LABEL: struct_ptr_nonatomic_buffer_load_i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -354,6 +360,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; ; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -377,6 +384,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -433,6 +441,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v2i16: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -559,6 +568,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; ; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 ; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -582,6 +592,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; ; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_clause 0x1 ; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -607,6 +618,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) ; ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 ; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -664,6 +676,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v4i32: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -721,6 +734,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p ; ; GFX12-LABEL: struct_ptr_atomic_buffer_load_ptr: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index ba769eff08dee..42fa032c344f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -13,6 +13,7 @@ declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0 define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { ; SDAG-REAL16-LABEL: tanh_f32: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 @@ -22,6 +23,7 @@ define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f32: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -37,6 +39,7 @@ define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_f32_constant_4.0: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -46,6 +49,7 @@ define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -60,6 +64,7 @@ define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_f32_constant_100.0: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -69,6 +74,7 @@ define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -83,10 +89,12 @@ define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_undef_f32: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_endpgm ; ; SDAG-FAKE16-LABEL: tanh_undef_f32: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_endpgm %tanh = call float @llvm.amdgcn.tanh.f32(float undef) store float %tanh, ptr addrspace(1) %out, align 4 @@ -96,6 +104,7 @@ define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 { ; SDAG-REAL16-LABEL: tanh_f16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 @@ -105,6 +114,7 @@ define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -119,6 +129,7 @@ define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 { define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_f16_constant_4.0: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 4.0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -128,6 +139,7 @@ define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f16_constant_4.0: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 4.0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -142,6 +154,7 @@ define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_f16_constant_100.0: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 0x5640 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -151,6 +164,7 @@ define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_f16_constant_100.0: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 0x5640 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -165,10 +179,12 @@ define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_undef_f16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_endpgm ; ; SDAG-FAKE16-LABEL: tanh_undef_f16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_endpgm %tanh = call half @llvm.amdgcn.tanh.f16(half undef) store half %tanh, ptr addrspace(1) %out, align 2 @@ -178,6 +194,7 @@ define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: tanh_bf16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 @@ -187,6 +204,7 @@ define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; ; SDAG-FAKE16-LABEL: tanh_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 @@ -201,6 +219,7 @@ define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_bf16_constant_4: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 4.0 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -210,6 +229,7 @@ define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_bf16_constant_4: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 4.0 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -224,6 +244,7 @@ define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_bf16_constant_100: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 0x42c8 ; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 @@ -233,6 +254,7 @@ define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 { ; ; SDAG-FAKE16-LABEL: tanh_bf16_constant_100: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 0x42c8 ; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 @@ -247,10 +269,12 @@ define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @tanh_undef_bf16(ptr addrspace(1) %out) #1 { ; SDAG-REAL16-LABEL: tanh_undef_bf16: ; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-REAL16-NEXT: s_endpgm ; ; SDAG-FAKE16-LABEL: tanh_undef_bf16: ; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-FAKE16-NEXT: s_endpgm %tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat undef) store bfloat %tanh, ptr addrspace(1) %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll index d2712ac8e08a3..ab2c0f468c1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll @@ -10,6 +10,7 @@ declare void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %D0, <8 x i32> %D1, define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { ; GFX1250-LABEL: tensor_load_to_lds: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; GFX1250-NEXT: s_endpgm entry: @@ -20,6 +21,7 @@ entry: define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { ; GFX1250-SDAG-LABEL: tensor_load_to_lds_vector: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 @@ -46,6 +48,7 @@ define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, < ; ; GFX1250-GISEL-LABEL: tensor_load_to_lds_vector: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 @@ -78,6 +81,7 @@ entry: define amdgpu_ps void @tensor_load_to_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { ; GFX1250-LABEL: tensor_load_to_lds_d2: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm entry: @@ -88,6 +92,7 @@ entry: define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { ; GFX1250-SDAG-LABEL: tensor_load_to_lds_d2_vector: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 @@ -106,6 +111,7 @@ define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1 ; ; GFX1250-GISEL-LABEL: tensor_load_to_lds_d2_vector: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 @@ -129,6 +135,7 @@ entry: define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) { ; GFX1250-LABEL: tensor_store_from_lds: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm entry: @@ -139,6 +146,7 @@ entry: define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { ; GFX1250-SDAG-LABEL: tensor_store_from_lds_vector: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 @@ -165,6 +173,7 @@ define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1 ; ; GFX1250-GISEL-LABEL: tensor_store_from_lds_vector: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 @@ -196,6 +205,7 @@ entry: define amdgpu_ps void @tensor_store_from_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) { ; GFX1250-LABEL: tensor_store_from_lds_d2: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11] ; GFX1250-NEXT: s_endpgm entry: @@ -206,6 +216,7 @@ entry: define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { ; GFX1250-SDAG-LABEL: tensor_store_from_lds_d2_vector: ; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 @@ -224,6 +235,7 @@ define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32> ; ; GFX1250-GISEL-LABEL: tensor_store_from_lds_d2_vector: ; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll index c597693d5a5f9..c95db198da4e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll @@ -29,6 +29,7 @@ define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) { ; ; GFX1250-LABEL: test_wave_id: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_bfe_u32 s0, ttmp8, 0x50019 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index b6c930dacf9ab..1150578a5ae92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -27,6 +29,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -35,6 +38,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -49,12 +53,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -67,12 +73,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -85,6 +93,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -93,6 +102,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -107,6 +117,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -115,6 +126,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -129,6 +141,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -137,6 +150,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -151,6 +165,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -159,6 +174,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -173,12 +189,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -191,12 +209,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -209,12 +229,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -227,12 +249,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -245,6 +269,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -253,6 +278,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -267,6 +293,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -275,6 +302,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -289,12 +317,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -307,6 +337,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -315,6 +346,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -329,6 +361,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -337,6 +370,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -351,6 +385,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -359,6 +394,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -373,6 +409,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -381,6 +418,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -395,6 +433,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -403,6 +442,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -417,6 +457,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -425,6 +466,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -439,6 +481,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -447,6 +490,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -461,6 +505,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -469,6 +514,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -483,6 +529,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -491,6 +538,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -505,6 +553,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -513,6 +562,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -527,6 +577,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -535,6 +586,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -549,6 +601,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -557,6 +610,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -571,6 +625,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -579,6 +634,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -593,6 +649,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -601,6 +658,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -615,6 +673,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 @@ -623,6 +682,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off @@ -637,6 +697,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -645,6 +706,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -659,6 +721,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 @@ -667,6 +730,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off @@ -681,6 +745,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -689,6 +754,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -703,6 +769,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -711,6 +778,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -725,6 +793,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 @@ -733,6 +802,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off @@ -747,6 +817,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -755,6 +826,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -769,6 +841,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 @@ -777,6 +850,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -791,6 +865,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 @@ -799,6 +874,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off @@ -813,6 +889,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 @@ -821,6 +898,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off @@ -835,6 +913,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -843,6 +922,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -857,6 +937,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 @@ -865,6 +946,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off @@ -879,6 +961,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -887,6 +970,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -901,6 +985,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -911,6 +996,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -927,6 +1013,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 @@ -935,6 +1022,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off @@ -949,6 +1037,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -957,6 +1046,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -971,6 +1061,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -979,6 +1070,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -993,6 +1085,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1001,6 +1094,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1015,6 +1109,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 @@ -1023,6 +1118,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off @@ -1037,6 +1133,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 @@ -1045,6 +1142,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off @@ -1059,6 +1157,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1067,6 +1166,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1081,6 +1181,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1089,6 +1190,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1103,6 +1205,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1111,6 +1214,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1125,6 +1229,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1133,6 +1238,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1147,6 +1253,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1155,6 +1262,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1169,6 +1277,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1177,6 +1286,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1191,6 +1301,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1199,6 +1310,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1213,6 +1325,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 @@ -1221,6 +1334,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off @@ -1235,6 +1349,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1243,6 +1358,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1257,6 +1373,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 @@ -1265,6 +1382,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off @@ -1279,6 +1397,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1287,6 +1406,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1301,6 +1421,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1309,6 +1430,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1323,6 +1445,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 @@ -1331,6 +1454,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off @@ -1345,6 +1469,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1353,6 +1478,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1367,6 +1493,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -1375,6 +1502,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -1389,6 +1517,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 @@ -1397,6 +1526,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off @@ -1411,6 +1541,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 @@ -1419,6 +1550,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off @@ -1433,6 +1565,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 @@ -1441,6 +1574,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[26:27], v[16:19], off @@ -1455,6 +1589,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 @@ -1463,6 +1598,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off @@ -1477,6 +1613,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1485,6 +1622,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1499,6 +1637,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -1509,6 +1648,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -1525,6 +1665,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 @@ -1533,6 +1674,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off @@ -1547,6 +1689,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1555,6 +1698,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1569,6 +1713,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1577,6 +1722,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1591,6 +1737,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1599,6 +1746,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1613,6 +1761,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 @@ -1621,6 +1770,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off @@ -1635,6 +1785,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 @@ -1643,6 +1794,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off @@ -1657,6 +1809,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1665,6 +1818,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1679,6 +1833,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1687,6 +1842,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1701,6 +1857,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1709,6 +1866,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1723,6 +1881,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1731,6 +1890,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1745,6 +1905,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1753,6 +1914,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1767,6 +1929,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1775,6 +1938,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1789,6 +1953,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1797,6 +1962,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1811,6 +1977,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 @@ -1819,6 +1986,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off @@ -1833,6 +2001,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1841,6 +2010,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1855,6 +2025,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 @@ -1863,6 +2034,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off @@ -1877,6 +2049,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1885,6 +2058,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1899,6 +2073,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1907,6 +2082,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1921,6 +2097,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 @@ -1929,6 +2106,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off @@ -1943,6 +2121,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1951,6 +2130,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1965,6 +2145,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 @@ -1973,6 +2154,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off @@ -1987,6 +2169,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 @@ -1995,6 +2178,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off @@ -2009,6 +2193,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 @@ -2017,6 +2202,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off @@ -2031,6 +2217,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off offset:16 @@ -2039,6 +2226,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[28:29], v[16:19], off @@ -2053,12 +2241,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -2071,12 +2261,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -2089,12 +2281,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -2107,12 +2301,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -2125,6 +2321,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -2133,6 +2330,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -2147,6 +2345,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -2155,6 +2354,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -2169,6 +2369,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -2177,6 +2378,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -2191,6 +2393,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -2199,6 +2402,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -2213,6 +2417,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -2223,6 +2428,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, < ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -2239,6 +2445,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48 @@ -2249,6 +2456,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off @@ -2265,6 +2473,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -2275,6 +2484,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -2291,6 +2501,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -2303,6 +2514,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -2321,6 +2533,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48 @@ -2331,6 +2544,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[44:45], v[24:27], off @@ -2347,6 +2561,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -2357,6 +2572,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -2373,6 +2589,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -2385,6 +2602,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse @@ -2403,6 +2621,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2412,6 +2631,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfl ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2427,6 +2647,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2434,6 +2655,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bf ; ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2447,6 +2669,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2456,6 +2679,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x ; ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2471,6 +2695,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2479,6 +2704,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2493,6 +2719,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2501,6 +2728,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2515,6 +2743,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2523,6 +2752,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2537,6 +2767,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2545,6 +2776,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2559,6 +2791,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: v_mov_b32_e32 v29, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2568,6 +2801,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -2581,6 +2815,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: v_mov_b32_e32 v29, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2590,6 +2825,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -2603,6 +2839,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: v_mov_b32_e32 v29, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2612,6 +2849,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -2625,6 +2863,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: v_mov_b32_e32 v29, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2634,6 +2873,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32 ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -2647,6 +2887,7 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2655,6 +2896,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B ; ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2669,6 +2911,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2678,6 +2921,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 ; GISEL-NEXT: s_clause 0x1 @@ -2693,6 +2937,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2700,6 +2945,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> ; ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index dc477992db81e..037e26087eaa5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 @@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off @@ -27,6 +29,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6 @@ -40,6 +43,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -67,6 +71,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v6, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 @@ -81,6 +86,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -109,6 +115,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -117,6 +124,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -131,6 +139,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -144,6 +153,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -163,6 +173,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -177,6 +188,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -197,12 +209,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -215,6 +229,7 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18 @@ -224,6 +239,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18 @@ -239,6 +255,7 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -249,6 +266,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -265,12 +283,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -283,6 +303,7 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -294,6 +315,7 @@ define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> % ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -311,6 +333,7 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -323,6 +346,7 @@ define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloa ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -341,6 +365,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -349,6 +374,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -363,6 +389,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -376,6 +403,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -403,6 +431,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -417,6 +446,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -445,6 +475,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -453,6 +484,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -467,6 +499,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -480,6 +513,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -507,6 +541,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -521,6 +556,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -549,6 +585,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -557,6 +594,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -571,6 +609,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -584,6 +623,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -611,6 +651,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -625,6 +666,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -653,6 +695,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -661,6 +704,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -675,6 +719,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -688,6 +733,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -715,6 +761,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -729,6 +776,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -757,12 +805,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -775,6 +825,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 @@ -785,6 +836,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -804,6 +856,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -814,6 +867,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -833,12 +887,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -851,6 +907,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 @@ -861,6 +918,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -880,6 +938,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -890,6 +949,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -909,12 +969,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -927,6 +989,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 @@ -937,6 +1000,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -956,6 +1020,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -966,6 +1031,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -985,12 +1051,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1003,6 +1071,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 @@ -1013,6 +1082,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1032,6 +1102,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -1042,6 +1113,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1061,6 +1133,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -1069,6 +1142,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, pt ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -1083,6 +1157,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -1096,6 +1171,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1 ; GISEL-NEXT: s_mov_b32 s2, 2 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1123,6 +1199,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x80 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -1137,6 +1214,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, < ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_movk_i32 s0, 0x80 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1165,6 +1243,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 @@ -1173,6 +1252,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off @@ -1187,6 +1267,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 @@ -1200,6 +1281,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1227,6 +1309,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -1241,6 +1324,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1269,12 +1353,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 ; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1287,6 +1373,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 @@ -1297,6 +1384,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1316,6 +1404,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 @@ -1326,6 +1415,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1345,6 +1435,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -1353,6 +1444,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -1367,6 +1459,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 @@ -1380,6 +1473,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1407,6 +1501,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -1421,6 +1516,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1449,6 +1545,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -1457,6 +1554,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -1471,6 +1569,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 @@ -1484,6 +1583,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1511,6 +1611,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) @@ -1526,6 +1627,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1555,6 +1657,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -1563,6 +1666,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -1577,6 +1681,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 @@ -1590,6 +1695,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i3 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1617,6 +1723,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65 ; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64 @@ -1633,6 +1740,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -1662,12 +1770,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1680,6 +1790,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 @@ -1690,6 +1801,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1709,6 +1821,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -1719,6 +1832,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1738,12 +1852,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1756,6 +1872,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 @@ -1766,6 +1883,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1785,6 +1903,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -1795,6 +1914,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1814,12 +1934,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1832,6 +1954,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 @@ -1842,6 +1965,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1861,6 +1985,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -1871,6 +1996,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1890,12 +2016,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1908,6 +2036,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 @@ -1918,6 +2047,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1937,6 +2067,7 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -1947,6 +2078,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GISEL-NEXT: s_mov_b32 s2, s0 @@ -1966,6 +2098,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -1974,6 +2107,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -1988,6 +2122,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 @@ -2001,6 +2136,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2028,6 +2164,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -2042,6 +2179,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2070,6 +2208,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -2078,6 +2217,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -2092,6 +2232,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 @@ -2105,6 +2246,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2132,6 +2274,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -2146,6 +2289,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2174,6 +2318,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -2182,6 +2327,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -2196,6 +2342,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 @@ -2209,6 +2356,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2236,6 +2384,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -2250,6 +2399,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2278,6 +2428,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 @@ -2286,6 +2437,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off @@ -2300,6 +2452,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 @@ -2313,6 +2466,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2340,6 +2494,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 @@ -2354,6 +2509,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s6, s0 @@ -2382,6 +2538,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0 ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 @@ -2392,6 +2549,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, p ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0 ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off @@ -2408,6 +2566,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26 @@ -2428,6 +2587,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 ; GISEL-NEXT: s_mov_b32 s14, s0 @@ -2469,6 +2629,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 @@ -2490,6 +2651,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_mov_b32 s14, s0 @@ -2532,6 +2694,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 @@ -2542,6 +2705,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off @@ -2558,6 +2722,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 @@ -2578,6 +2743,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s14, s0 @@ -2619,6 +2785,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 ; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -2641,6 +2808,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32 ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 ; GISEL-NEXT: s_mov_b32 s14, s0 @@ -2684,6 +2852,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 @@ -2694,6 +2863,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off @@ -2710,6 +2880,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 @@ -2730,6 +2901,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 ; GISEL-NEXT: s_mov_b32 s14, s0 @@ -2771,6 +2943,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 ; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65 ; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64 @@ -2794,6 +2967,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 ; GISEL-NEXT: s_mov_b32 s14, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index 8f674f84206ff..eb7c15587654c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -27,6 +29,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -35,6 +38,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -49,6 +53,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -57,6 +62,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -71,6 +77,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -79,6 +86,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x f ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -93,6 +101,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 @@ -101,6 +110,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x fl ; ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off @@ -115,6 +125,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -123,6 +134,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -137,6 +149,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -145,6 +158,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -159,6 +173,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -167,6 +182,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -181,6 +197,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -189,6 +206,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -203,6 +221,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -211,6 +230,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -225,12 +245,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -243,12 +265,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -261,12 +285,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -279,12 +305,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -297,12 +325,14 @@ bb: define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -315,12 +345,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -333,12 +365,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -351,12 +385,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -369,12 +405,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -387,12 +425,14 @@ bb: define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off ; GISEL-NEXT: s_endpgm @@ -405,6 +445,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -413,6 +454,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -427,6 +469,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -435,6 +478,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -449,6 +493,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -457,6 +502,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -471,6 +517,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -479,6 +526,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -493,6 +541,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -501,6 +550,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -515,6 +565,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -523,6 +574,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -537,6 +589,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -545,6 +598,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -559,6 +613,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -567,6 +622,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -581,6 +637,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -589,6 +646,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -603,6 +661,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -611,6 +670,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -625,6 +685,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -633,6 +694,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -647,6 +709,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -655,6 +718,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -669,12 +733,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -687,12 +753,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -705,12 +773,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -723,12 +793,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -741,12 +813,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -759,12 +833,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -777,12 +853,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -795,12 +873,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -813,12 +893,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -831,12 +913,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -849,12 +933,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -867,12 +953,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -885,6 +973,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -893,6 +982,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32 ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -907,6 +997,7 @@ bb: define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -915,6 +1006,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32 ; ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -929,6 +1021,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -937,6 +1030,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x hal ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -951,6 +1045,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -959,6 +1054,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x hal ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -973,6 +1069,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -981,6 +1078,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x hal ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -995,6 +1093,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -1003,6 +1102,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -1017,6 +1117,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 @@ -1025,6 +1126,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off @@ -1039,12 +1141,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -1057,12 +1161,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -1075,12 +1181,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -1093,12 +1201,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -1111,12 +1221,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off ; GISEL-NEXT: s_endpgm @@ -1129,6 +1241,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1137,6 +1250,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1151,6 +1265,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1159,6 +1274,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1173,6 +1289,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1181,6 +1298,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1195,6 +1313,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1203,6 +1322,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1217,6 +1337,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1225,6 +1346,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1239,6 +1361,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1247,6 +1370,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> % ; ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1261,6 +1385,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1269,6 +1394,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_negC(<16 x i32> %A ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1283,6 +1409,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1291,6 +1418,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC(<16 x i32 ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1305,6 +1433,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1313,6 +1442,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> ; ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1327,12 +1457,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1345,12 +1477,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1363,12 +1497,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1381,12 +1517,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1399,12 +1537,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1417,12 +1557,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1435,12 +1577,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1453,12 +1597,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1471,12 +1617,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1489,12 +1637,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1507,12 +1657,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1525,12 +1677,14 @@ bb: define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] ; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off ; GFX1250-NEXT: s_endpgm ; ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] ; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off ; GISEL-NEXT: s_endpgm @@ -1543,6 +1697,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1551,6 +1706,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1565,6 +1721,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1573,6 +1730,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1587,6 +1745,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1595,6 +1754,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1609,6 +1769,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1617,6 +1778,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1631,6 +1793,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1639,6 +1802,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1653,6 +1817,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1661,6 +1826,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1675,6 +1841,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1683,6 +1850,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1697,6 +1865,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1705,6 +1874,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1719,6 +1889,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1727,6 +1898,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1741,6 +1913,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1749,6 +1922,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1763,6 +1937,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1771,6 +1946,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1785,6 +1961,7 @@ bb: define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 @@ -1793,6 +1970,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <1 ; ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off @@ -1807,6 +1985,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1817,6 +1996,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1833,6 +2013,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1843,6 +2024,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1859,6 +2041,7 @@ bb: define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1869,6 +2052,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i3 ; ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1885,6 +2069,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1895,6 +2080,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1911,6 +2097,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1921,6 +2108,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1937,6 +2125,7 @@ bb: define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1947,6 +2136,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, < ; ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1963,6 +2153,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1973,6 +2164,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -1989,6 +2181,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -1999,6 +2192,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -2015,6 +2209,7 @@ bb: define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GFX1250-NEXT: s_clause 0x3 ; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 @@ -2025,6 +2220,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, ; ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 ; GISEL-NEXT: s_clause 0x3 ; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off @@ -2041,6 +2237,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2050,6 +2247,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2065,6 +2263,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2074,6 +2273,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2089,6 +2289,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2096,6 +2297,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32 ; ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2109,6 +2311,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2116,6 +2319,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32 ; ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2129,6 +2333,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2138,6 +2343,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A, ; ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2153,6 +2359,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2162,6 +2369,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A, ; ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GISEL-NEXT: s_clause 0x1 @@ -2177,6 +2385,7 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2185,6 +2394,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x ; ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2199,6 +2409,7 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 @@ -2207,6 +2418,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x ; ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off @@ -2221,6 +2433,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2230,6 +2443,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x h ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 ; GISEL-NEXT: s_clause 0x1 @@ -2245,6 +2459,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 ; GFX1250-NEXT: s_clause 0x1 @@ -2254,6 +2469,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x h ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 ; GISEL-NEXT: s_clause 0x1 @@ -2269,6 +2485,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negA: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2276,6 +2493,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x h ; ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negA: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off @@ -2289,6 +2507,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negB: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 ; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off @@ -2296,6 +2515,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x h ; ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negB: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll index b8745e0ebf480..537c43d02909b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll @@ -5,6 +5,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v32, v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -15,6 +16,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfl ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b32 v32, v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -34,6 +36,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v28, v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 @@ -42,6 +45,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bf ; ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b32 v28, v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 @@ -59,6 +63,7 @@ bb: define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v32, v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -69,6 +74,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x ; ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b32 v32, v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -88,6 +94,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -99,6 +106,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -120,6 +128,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -131,6 +140,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -151,6 +161,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -162,6 +173,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -183,6 +195,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -194,6 +207,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -214,6 +228,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -225,6 +240,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -246,6 +262,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -257,6 +274,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -277,6 +295,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -288,6 +307,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -309,6 +329,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -320,6 +341,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -340,6 +362,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -349,6 +372,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -368,6 +392,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -377,6 +402,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -395,6 +421,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -404,6 +431,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -423,6 +451,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -432,6 +461,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -450,6 +480,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -459,6 +490,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -478,6 +510,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -487,6 +520,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -505,6 +539,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -514,6 +549,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -533,6 +569,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -542,6 +579,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 @@ -560,6 +598,7 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -571,6 +610,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 ; ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -592,6 +632,7 @@ bb: define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -603,6 +644,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 ; ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 @@ -623,6 +665,7 @@ bb: define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v32, v[32:33], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -633,6 +676,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> ; ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b32 v32, v[32:33], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 @@ -652,6 +696,7 @@ bb: define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_load_b32 v28, v[28:29], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 @@ -660,6 +705,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> ; ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16: ; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: global_load_b32 v28, v[28:29], off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll index 0948530abdf1b..fe98261e2f96c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll @@ -9,10 +9,12 @@ declare bfloat @llvm.cos.bf16(bfloat) #0 define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; GCN-LABEL: cos_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0x3e230000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_cos_bf16_e32 v0, v0 @@ -26,6 +28,7 @@ define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { ; GCN-LABEL: cos_bf16_constant_4: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -40,6 +43,7 @@ define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { ; GCN-LABEL: cos_bf16_constant_100: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index c265b05813ee7..a5a70bb811a65 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -17,18 +17,25 @@ define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1) ret void @@ -37,18 +44,25 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:512 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr_offset: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -60,18 +74,25 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_max_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:8388607 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_max_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_max_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1) @@ -81,12 +102,14 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_min_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:-8388608 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], 0xffffffffff800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] @@ -108,6 +131,7 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset: ; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 @@ -128,12 +152,14 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_too_large_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 ; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm @@ -150,6 +176,7 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre ; ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset: ; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 @@ -172,11 +199,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_global(ptr addrspace(1) %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_global: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -195,11 +224,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_flat(ptr %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_flat: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -218,11 +249,13 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_global(ptr addrspace(1) inreg %ptr, i32 %offset) { ; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_global: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -248,11 +281,13 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_flat(ptr inreg %ptr, i32 %offset) { ; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_flat: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -279,18 +314,46 @@ entry: ; Check LDS and Scratch, we cannot prefetch it define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) { -; GCN-LABEL: prefetch_data_lds: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_lds: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_lds: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_lds: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_lds: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1) ret void } define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) { -; GCN-LABEL: prefetch_data_scratch: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_data_scratch: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_data_scratch: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_data_scratch: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_scratch: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1) ret void @@ -301,18 +364,25 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_flat: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr_flat: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_flat: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1) ret void @@ -321,18 +391,25 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_global: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr_global: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_global: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1) ret void @@ -341,17 +418,25 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_constant_32bit: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_mov_b32 s1, 0 -; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_mov_b32 s1, 0 +; GFX1250-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_mov_b32 s1, 0 +; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1) ret void @@ -362,16 +447,23 @@ entry: define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_inst_sgpr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_inst_sgpr: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_inst_sgpr: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0) ret void @@ -380,16 +472,23 @@ entry: define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_inst_sgpr_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_inst_sgpr_offset: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -401,16 +500,23 @@ entry: define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_inst_sgpr_max_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; -; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: -; SPREFETCH: ; %bb.0: ; %entry -; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 -; SPREFETCH-NEXT: s_endpgm +; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 +; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: ; NOSPREFETCH: ; %bb.0: ; %entry ; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0 +; GFX12-SPREFETCH-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607 tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0) @@ -420,10 +526,12 @@ entry: define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_inst_sgpr_min_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], 0xffffffffff800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] @@ -445,6 +553,7 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset: ; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 @@ -465,10 +574,12 @@ entry: define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) { ; GFX1250-LABEL: prefetch_inst_sgpr_too_large_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 ; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm @@ -485,6 +596,7 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre ; ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset: ; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000 ; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 @@ -507,11 +619,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_flat_dev(ptr %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_flat_dev: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -530,11 +644,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_flat_se(ptr %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_flat_se: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -553,16 +669,19 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_flat_cu(ptr %ptr) { ; GL2-ONLY-LABEL: prefetch_data_vgpr_flat_cu: ; GL2-ONLY: ; %bb.0: ; %entry +; GL2-ONLY-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GL2-ONLY-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE ; GL2-ONLY-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE ; GFX1250-SPREFETCH-NEXT: s_endpgm ; ; SAFE-CU-LABEL: prefetch_data_vgpr_flat_cu: ; SAFE-CU: ; %bb.0: ; %entry +; SAFE-CU-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SAFE-CU-NEXT: flat_prefetch_b8 v[0:1] ; SAFE-CU-NEXT: s_endpgm ; @@ -583,11 +702,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_flat_offset(ptr %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_flat_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -607,11 +728,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_global_offset(ptr addrspace(1) %ptr) { ; GFX1250-LABEL: prefetch_data_vgpr_global_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -631,11 +754,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_global_saddr(ptr addrspace(1) inreg %ptr, i32 %voffset) { ; GFX1250-LABEL: prefetch_data_vgpr_global_saddr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -655,11 +780,13 @@ entry: define amdgpu_ps void @prefetch_data_vgpr_global_saddr_offset(ptr addrspace(1) inreg %ptr, i32 %voffset) { ; GFX1250-LABEL: prefetch_data_vgpr_global_saddr_offset: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm ; @@ -680,18 +807,46 @@ entry: ; Cannot prefetch I$ with flat or global instructions. define amdgpu_ps void @prefetch_inst_vgpr_global(ptr addrspace(1) %ptr) { -; GCN-LABEL: prefetch_inst_vgpr_global: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_inst_vgpr_global: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_inst_vgpr_global: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_inst_vgpr_global: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_inst_vgpr_global: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 0) ret void } define amdgpu_ps void @prefetch_inst_vgpr_flat(ptr %ptr) { -; GCN-LABEL: prefetch_inst_vgpr_flat: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: prefetch_inst_vgpr_flat: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_endpgm +; +; GFX1250-SPREFETCH-LABEL: prefetch_inst_vgpr_flat: +; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-SPREFETCH-NEXT: s_endpgm +; +; NOSPREFETCH-LABEL: prefetch_inst_vgpr_flat: +; NOSPREFETCH: ; %bb.0: ; %entry +; NOSPREFETCH-NEXT: s_endpgm +; +; GFX12-SPREFETCH-LABEL: prefetch_inst_vgpr_flat: +; GFX12-SPREFETCH: ; %bb.0: ; %entry +; GFX12-SPREFETCH-NEXT: s_endpgm entry: tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 0) ret void @@ -702,12 +857,14 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_flat_force_vector(ptr inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_flat_force_vector: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm @@ -728,12 +885,14 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_global_force_vector(ptr addrspace(1) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_global_force_vector: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm @@ -754,12 +913,14 @@ entry: define amdgpu_ps void @prefetch_data_sgpr_global_saddr_force_vector(ptr addrspace(1) inreg %ptr) { ; GFX1250-LABEL: prefetch_data_sgpr_global_saddr_force_vector: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector: ; GFX1250-SPREFETCH: ; %bb.0: ; %entry +; GFX1250-SPREFETCH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS ; GFX1250-SPREFETCH-NEXT: s_endpgm @@ -784,3 +945,6 @@ declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i3 declare void @llvm.prefetch.p4(ptr addrspace(4) nocapture readonly, i32, i32, i32) declare void @llvm.prefetch.p5(ptr addrspace(5) nocapture readonly, i32, i32, i32) declare void @llvm.prefetch.p6(ptr addrspace(6) nocapture readonly, i32, i32, i32) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; SPREFETCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll index fe8ace52c06bf..c44b12eaa6686 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll @@ -9,10 +9,12 @@ declare bfloat @llvm.sin.bf16(bfloat) #0 define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; GCN-LABEL: sin_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0x3e230000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_sin_bf16_e32 v0, v0 @@ -26,6 +28,7 @@ define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { ; GCN-LABEL: sin_bf16_constant_4: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -40,6 +43,7 @@ define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { ; GCN-LABEL: sin_bf16_constant_100: ; GCN: ; %bb.0: +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll index 818dff4924f40..bbbf4b66cf55b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll @@ -8,6 +8,7 @@ declare <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX12-TRUE16-LABEL: sqrt_bf16: ; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -26,6 +27,7 @@ define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; ; GFX12-FAKE16-LABEL: sqrt_bf16: ; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -51,6 +53,7 @@ entry: define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX12-TRUE16-LABEL: sqrt_v2bf16: ; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 @@ -72,6 +75,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) ; ; GFX12-FAKE16-LABEL: sqrt_v2bf16: ; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 9fdc72f054f90..876940da7f575 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -78,6 +78,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace ; ; GFX1250-LABEL: constant_load_i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -161,6 +162,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v2i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -241,6 +243,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v3i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -322,6 +325,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v4i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -403,6 +407,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: constant_load_v8i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -484,6 +489,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v16i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -549,6 +555,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v32i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -617,6 +624,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v64i1: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -689,6 +697,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_zextload_i1_to_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -766,6 +775,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_sextload_i1_to_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -840,6 +850,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -917,6 +928,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1002,6 +1014,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1088,6 +1101,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1184,6 +1198,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v3, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1281,6 +1296,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1378,6 +1394,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1480,6 +1497,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1621,6 +1639,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v8, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1767,6 +1786,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -1989,6 +2009,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v16, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2222,6 +2243,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u16 s2, s[2:3], 0x0 @@ -2629,6 +2651,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -3100,6 +3123,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -3870,6 +3894,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -4766,6 +4791,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -4970,6 +4996,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_zextload_i1_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -5052,6 +5079,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; ; GFX1250-LABEL: constant_sextload_i1_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5134,6 +5162,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 @@ -5216,6 +5245,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5311,6 +5341,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5412,6 +5443,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5530,6 +5562,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v5, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5661,6 +5694,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v5, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5799,6 +5833,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -5948,6 +5983,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v9, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6135,6 +6171,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6367,6 +6404,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v16, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -6695,6 +6733,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7100,6 +7139,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v32, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -7698,6 +7738,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -8495,6 +8536,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -9657,6 +9699,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -11206,6 +11249,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6f7ee70812264..8812dfd9ed280 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -87,6 +87,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; ; GFX1250-LABEL: constant_load_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -184,6 +185,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v2i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -293,6 +295,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v3i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b96 s[4:6], s[2:3], 0x0 @@ -400,6 +403,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v4i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -545,6 +549,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v8i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 @@ -724,6 +729,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: constant_load_v9i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 @@ -912,6 +918,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: constant_load_v10i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 @@ -1112,6 +1119,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: constant_load_v11i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 @@ -1310,6 +1318,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: constant_load_v12i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 @@ -1540,6 +1549,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: constant_load_v16i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 @@ -1648,6 +1658,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX1250-LABEL: constant_zextload_i32_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -1752,6 +1763,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; ; GFX1250-LABEL: constant_sextload_i32_to_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -1851,6 +1863,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -1955,6 +1968,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 @@ -2067,6 +2081,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -2192,6 +2207,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 @@ -2338,6 +2354,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -2515,6 +2532,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 @@ -2735,6 +2753,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 @@ -3025,6 +3044,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX1250-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 @@ -3532,6 +3552,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX1250-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 @@ -3928,6 +3949,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX1250-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 @@ -4870,6 +4892,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX1250-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 @@ -5617,6 +5640,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX1250-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40 @@ -6048,6 +6072,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: constant_load_v32i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 2ff69d234455f..5eb6bf6d8179a 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -66,6 +66,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; ; GFX1250-LABEL: copy_flat: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 @@ -161,6 +162,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp ; ; GFX1250-LABEL: copy_global: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 @@ -258,6 +260,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr ; ; GFX1250-LABEL: copy_constant: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s6, 0 @@ -355,6 +358,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa ; ; GFX1250-LABEL: copy_local: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 @@ -481,6 +485,7 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; ; GFX1250-LABEL: copy_flat_divergent: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 @@ -614,6 +619,7 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; ; GFX1250-LABEL: copy_global_divergent: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 6b6658bd672de..48363d627839b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -49,6 +49,7 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; ; GFX1250-SDAG-LABEL: _amdgpu_cs_main: ; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x40010 ; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 @@ -79,6 +80,7 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; ; GFX1250-GISEL-LABEL: _amdgpu_cs_main: ; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 @@ -164,6 +166,7 @@ define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0" ; ; GFX1250-SDAG-LABEL: workgroup_id_no_clusters: ; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff ; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 @@ -173,6 +176,7 @@ define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0" ; ; GFX1250-GISEL-LABEL: workgroup_id_no_clusters: ; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, ttmp9 ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff ; GFX1250-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 @@ -234,6 +238,7 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-SDAG-LABEL: workgroup_id_optimized: ; GFX1250-SDAG: ; %bb.0: ; %.entry +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14 ; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff ; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 @@ -252,6 +257,7 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-GISEL-LABEL: workgroup_id_optimized: ; GFX1250-GISEL: ; %bb.0: ; %.entry +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff ; GFX1250-GISEL-NEXT: s_and_b32 s0, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x40004 @@ -333,12 +339,14 @@ define amdgpu_cs void @caller() { ; ; GFX1250-SDAG-LABEL: caller: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, 1 ; GFX1250-SDAG-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) ; GFX1250-SDAG-NEXT: s_mul_i32 s0, ttmp9, s0 -; GFX1250-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, ttmp9, s1 @@ -349,12 +357,14 @@ define amdgpu_cs void @caller() { ; ; GFX1250-GISEL-LABEL: caller: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 ; GFX1250-GISEL-NEXT: s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4) ; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 -; GFX1250-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 ; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, ttmp9, s1 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll index c96ba754c0811..83625a59ed69f 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll @@ -595,6 +595,7 @@ define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bflo define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1] diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 87d52684e588c..66e3f9866268e 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1118,6 +1118,7 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; GFX1250-LABEL: mad_i64_i32_uniform: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -2226,6 +2227,7 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { ; ; GFX1250-LABEL: lshr_mad_i64_sgpr: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mov_b32 s3, 0 ; GFX1250-NEXT: s_mov_b32 s2, s1 ; GFX1250-NEXT: s_mov_b64 s[4:5], 0xffffffffffff1c18 diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll index 1112be3aeac07..709679cb19184 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -26,6 +26,7 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-LABEL: mad_i32_vvv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v2 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -58,6 +59,7 @@ define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { ; ; GFX1250-LABEL: mad_i32_sss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: s_add_co_i32 s0, s0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -88,6 +90,7 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) { ; ; GFX1250-LABEL: mad_i32_vvc: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 42 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -118,6 +121,7 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) { ; ; GFX1250-LABEL: mad_i32_vvi: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0x12d687 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -148,6 +152,7 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) { ; ; GFX1250-LABEL: mad_i32_vvi_neg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, 0xffed2979 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -175,6 +180,7 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) { ; ; GFX1250-LABEL: mad_i32_vcv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, 42, v1 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, 42 @@ -202,6 +208,7 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) { ; ; GFX1250-LABEL: mad_i32_vcc: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, 42, 43 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, 42 @@ -230,6 +237,7 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) { ; ; GFX1250-LABEL: mad_i32_vvs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, s0 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -257,6 +265,7 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) { ; ; GFX1250-LABEL: mad_i32_vsv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, s0, v1 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -284,6 +293,7 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) { ; ; GFX1250-LABEL: mad_i32_svv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, s0, v0, v1 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -314,6 +324,7 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) { ; ; GFX1250-LABEL: mad_i32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, v0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -344,6 +355,7 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) { ; ; GFX1250-LABEL: mad_i32_svs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, s0, v0, s1 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -372,6 +384,7 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { ; ; GFX1250-LABEL: mad_i32_ssv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mad_u32 v0, s0, s1, v0 ; GFX1250-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b @@ -407,6 +420,7 @@ define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) { ; ; GFX1250-LABEL: mad_i32_vvv_multiuse: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v2 ; GFX1250-NEXT: flat_store_b32 v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index c48e25f36e99f..ac6dd30283554 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -27,6 +27,7 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imax_sge_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 @@ -100,6 +101,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imax_sge_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v4, 0 @@ -166,6 +168,7 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_imax_sge_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -206,6 +209,7 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX1250-LABEL: s_test_imax_sge_imm_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -256,6 +260,7 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_imax_sge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -327,6 +332,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX1250-LABEL: s_test_imax_sgt_imm_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -369,6 +375,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x ; ; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -421,6 +428,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imax_sgt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 @@ -481,6 +489,7 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_imax_sgt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -530,6 +539,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umax_uge_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 @@ -590,6 +600,7 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_umax_uge_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -636,6 +647,7 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32 ; ; GFX1250-LABEL: s_test_umax_uge_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 @@ -696,6 +708,7 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_umax_uge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -766,6 +779,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umax_ugt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 @@ -824,6 +838,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_umax_ugt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -866,6 +881,7 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x ; ; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -914,6 +930,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac ; ; GFX1250-LABEL: simplify_demanded_bits_test_umax_ugt_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 @@ -977,6 +994,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace ; ; GFX1250-LABEL: simplify_demanded_bits_test_max_slt_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 @@ -1039,6 +1057,7 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32], ; ; GFX1250-LABEL: s_test_imax_sge_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x70 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c @@ -1113,6 +1132,7 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_ugt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -1166,6 +1186,7 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_uge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -1219,6 +1240,7 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sgt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -1272,6 +1294,7 @@ define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index 282a7ae7ea2fd..434e71295aad2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -42,6 +42,7 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX1250-LABEL: test_s_barrier: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm @@ -101,6 +102,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX1250-LABEL: test_s_barrier_workgroup_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_barrier_signal -1 @@ -163,6 +165,7 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; ; GFX1250-LABEL: test_s_barrier_agent_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 7efbff9c637c5..b07807aafbf1d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -82,6 +82,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX1250-LABEL: workgroup_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -163,6 +164,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -249,6 +251,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -335,6 +338,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -411,6 +415,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -492,6 +497,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX1250-LABEL: workgroup_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -578,6 +584,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -664,6 +671,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -762,6 +770,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX1250-LABEL: agent_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -851,6 +860,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -956,6 +966,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1062,6 +1073,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1162,6 +1174,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; GFX1250-LABEL: agent_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1251,6 +1264,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX1250-LABEL: agent_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1356,6 +1370,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: agent_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1462,6 +1477,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: agent_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1564,6 +1580,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX1250-LABEL: system_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1657,6 +1674,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1768,6 +1786,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1880,6 +1899,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1982,6 +2002,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; GFX1250-LABEL: system_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2075,6 +2096,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX1250-LABEL: system_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2186,6 +2208,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: system_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2298,6 +2321,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: system_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 8b0b099999f06..022ec1d601bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -80,6 +80,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX1250-LABEL: workgroup_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -153,6 +154,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -226,6 +228,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -299,6 +302,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -361,6 +365,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -422,6 +427,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX1250-LABEL: workgroup_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -483,6 +489,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -544,6 +551,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -616,6 +624,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX1250-LABEL: agent_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -689,6 +698,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -762,6 +772,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -835,6 +846,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -897,6 +909,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; GFX1250-LABEL: agent_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -958,6 +971,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX1250-LABEL: agent_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1019,6 +1033,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: agent_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1080,6 +1095,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: agent_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1152,6 +1168,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX1250-LABEL: system_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -1225,6 +1242,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -1298,6 +1316,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -1371,6 +1390,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: @@ -1433,6 +1453,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; GFX1250-LABEL: system_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1494,6 +1515,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX1250-LABEL: system_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1555,6 +1577,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: system_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1616,6 +1639,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: system_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 1cca64ad6d2b4..127788e5a4713 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -69,6 +69,7 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; ; GFX1250-LABEL: singlethread_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire @@ -130,6 +131,7 @@ define amdgpu_kernel void @singlethread_release_fence() { ; ; GFX1250-LABEL: singlethread_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") release @@ -191,6 +193,7 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; ; GFX1250-LABEL: singlethread_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel @@ -252,6 +255,7 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; ; GFX1250-LABEL: singlethread_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst @@ -313,6 +317,7 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; ; GFX1250-LABEL: singlethread_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire @@ -374,6 +379,7 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; ; GFX1250-LABEL: singlethread_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release @@ -435,6 +441,7 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: singlethread_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel @@ -496,6 +503,7 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: singlethread_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst @@ -557,6 +565,7 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; ; GFX1250-LABEL: wavefront_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire @@ -618,6 +627,7 @@ define amdgpu_kernel void @wavefront_release_fence() { ; ; GFX1250-LABEL: wavefront_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") release @@ -679,6 +689,7 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; ; GFX1250-LABEL: wavefront_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel @@ -740,6 +751,7 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; ; GFX1250-LABEL: wavefront_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst @@ -801,6 +813,7 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; ; GFX1250-LABEL: wavefront_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire @@ -862,6 +875,7 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; ; GFX1250-LABEL: wavefront_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release @@ -923,6 +937,7 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: wavefront_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel @@ -984,6 +999,7 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: wavefront_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst @@ -1066,6 +1082,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX1250-LABEL: workgroup_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1152,6 +1169,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1243,6 +1261,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1334,6 +1353,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1410,6 +1430,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1491,6 +1512,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX1250-LABEL: workgroup_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1577,6 +1599,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1663,6 +1686,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1762,6 +1786,7 @@ define amdgpu_kernel void @cluster_acquire_fence() { ; ; GFX1250-LABEL: cluster_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -1851,6 +1876,7 @@ define amdgpu_kernel void @cluster_release_fence() { ; ; GFX1250-LABEL: cluster_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1955,6 +1981,7 @@ define amdgpu_kernel void @cluster_acq_rel_fence() { ; ; GFX1250-LABEL: cluster_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -2060,6 +2087,7 @@ define amdgpu_kernel void @cluster_seq_cst_fence() { ; ; GFX1250-LABEL: cluster_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -2159,6 +2187,7 @@ define amdgpu_kernel void @cluster_one_as_acquire_fence() { ; ; GFX1250-LABEL: cluster_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -2248,6 +2277,7 @@ define amdgpu_kernel void @cluster_one_as_release_fence() { ; ; GFX1250-LABEL: cluster_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2352,6 +2382,7 @@ define amdgpu_kernel void @cluster_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: cluster_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -2457,6 +2488,7 @@ define amdgpu_kernel void @cluster_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: cluster_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SE @@ -2556,6 +2588,7 @@ define amdgpu_kernel void @agent_acquire_fence() { ; ; GFX1250-LABEL: agent_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2645,6 +2678,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2750,6 +2784,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2856,6 +2891,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2956,6 +2992,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; ; GFX1250-LABEL: agent_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3045,6 +3082,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX1250-LABEL: agent_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -3150,6 +3188,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: agent_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -3256,6 +3295,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: agent_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -3358,6 +3398,7 @@ define amdgpu_kernel void @system_acquire_fence() { ; ; GFX1250-LABEL: system_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3451,6 +3492,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3562,6 +3604,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3674,6 +3717,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3776,6 +3820,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; ; GFX1250-LABEL: system_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3869,6 +3914,7 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; ; GFX1250-LABEL: system_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -3980,6 +4026,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: system_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -4092,6 +4139,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: system_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index bf516f8b91c91..01c60243c5163 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX1250-LABEL: flat_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX1250-LABEL: flat_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -592,6 +594,7 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX1250-LABEL: flat_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -823,6 +826,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX1250-LABEL: flat_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -987,6 +991,7 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX1250-LABEL: flat_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1147,6 +1152,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX1250-LABEL: flat_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1331,6 +1337,7 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX1250-LABEL: flat_agent_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1518,6 +1525,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX1250-LABEL: flat_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1681,6 +1689,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1872,6 +1881,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2058,6 +2068,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX1250-LABEL: flat_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2276,6 +2287,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2496,6 +2508,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2720,6 +2733,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2967,6 +2981,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3217,6 +3232,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3472,6 +3488,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3756,6 +3773,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4035,6 +4053,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4346,6 +4365,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4659,6 +4679,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4948,6 +4969,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5234,6 +5256,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5544,6 +5567,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5857,6 +5881,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6170,6 +6195,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6483,6 +6509,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6796,6 +6823,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7109,6 +7137,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7422,6 +7451,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7735,6 +7765,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8037,6 +8068,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8353,6 +8385,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8679,6 +8712,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9022,6 +9056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9366,6 +9401,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9686,6 +9722,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10003,6 +10040,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10344,6 +10382,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10688,6 +10727,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11032,6 +11072,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11376,6 +11417,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11720,6 +11762,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12064,6 +12107,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12408,6 +12452,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12752,6 +12797,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12957,6 +13003,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX1250-LABEL: flat_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13150,6 +13197,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13368,6 +13416,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13610,6 +13659,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13775,6 +13825,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX1250-LABEL: flat_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13935,6 +13986,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14119,6 +14171,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX1250-LABEL: flat_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14306,6 +14359,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14469,6 +14523,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14656,6 +14711,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14842,6 +14898,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15056,6 +15113,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15272,6 +15330,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15506,6 +15565,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15764,6 +15824,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -16025,6 +16086,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -16281,6 +16343,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16561,6 +16624,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16840,6 +16904,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17147,6 +17212,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17456,6 +17522,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17741,6 +17808,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18023,6 +18091,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18329,6 +18398,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18638,6 +18708,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18947,6 +19018,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19256,6 +19328,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19565,6 +19638,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19874,6 +19948,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20183,6 +20258,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20492,6 +20568,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20794,6 +20871,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21120,6 +21198,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21447,6 +21526,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21800,6 +21880,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22155,6 +22236,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22486,6 +22568,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22814,6 +22897,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23166,6 +23250,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23521,6 +23606,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23876,6 +23962,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24231,6 +24318,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24586,6 +24674,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24941,6 +25030,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25296,6 +25386,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25651,6 +25742,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll index b2b71c246c97b..c4d1bb8de3d8a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_cluster_unordered_load( ; ; GFX1250-LABEL: flat_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_load( ; ; GFX1250-LABEL: flat_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -592,6 +594,7 @@ define amdgpu_kernel void @flat_cluster_acquire_load( ; ; GFX1250-LABEL: flat_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -823,6 +826,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; ; GFX1250-LABEL: flat_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -987,6 +991,7 @@ define amdgpu_kernel void @flat_cluster_unordered_store( ; ; GFX1250-LABEL: flat_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1147,6 +1152,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_store( ; ; GFX1250-LABEL: flat_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1331,6 +1337,7 @@ define amdgpu_kernel void @flat_cluster_release_store( ; ; GFX1250-LABEL: flat_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1517,6 +1524,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store( ; ; GFX1250-LABEL: flat_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1679,6 +1687,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1870,6 +1879,7 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2056,6 +2066,7 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2273,6 +2284,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2492,6 +2504,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2715,6 +2728,7 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2962,6 +2976,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3211,6 +3226,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3465,6 +3481,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3749,6 +3766,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4028,6 +4046,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4338,6 +4357,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4650,6 +4670,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4938,6 +4959,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5224,6 +5246,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5534,6 +5557,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5846,6 +5870,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6158,6 +6183,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6470,6 +6496,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6782,6 +6809,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7094,6 +7122,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7406,6 +7435,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7718,6 +7748,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8019,6 +8050,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8335,6 +8367,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8661,6 +8694,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9003,6 +9037,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9346,6 +9381,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9665,6 +9701,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9982,6 +10019,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10323,6 +10361,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10666,6 +10705,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11009,6 +11049,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11352,6 +11393,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11695,6 +11737,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12038,6 +12081,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12381,6 +12425,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12724,6 +12769,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12928,6 +12974,7 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load( ; ; GFX1250-LABEL: flat_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13121,6 +13168,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13339,6 +13387,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13581,6 +13630,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13746,6 +13796,7 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store( ; ; GFX1250-LABEL: flat_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13906,6 +13957,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14090,6 +14142,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store( ; ; GFX1250-LABEL: flat_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14276,6 +14329,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14438,6 +14492,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14625,6 +14680,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14811,6 +14867,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15024,6 +15081,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15239,6 +15297,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15472,6 +15531,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15730,6 +15790,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15990,6 +16051,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -16245,6 +16307,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16525,6 +16588,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16804,6 +16868,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17110,6 +17175,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17418,6 +17484,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17702,6 +17769,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17984,6 +18052,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18290,6 +18359,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18598,6 +18668,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18906,6 +18977,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19214,6 +19286,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19522,6 +19595,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19830,6 +19904,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20138,6 +20213,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20446,6 +20522,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20747,6 +20824,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21073,6 +21151,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21400,6 +21479,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21752,6 +21832,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22106,6 +22187,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22436,6 +22518,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22764,6 +22847,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23116,6 +23200,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23470,6 +23555,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23824,6 +23910,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24178,6 +24265,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24532,6 +24620,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24886,6 +24975,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25240,6 +25330,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25594,6 +25685,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index bdde7c0975425..5de7d4325c8c8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -20,6 +20,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; ; GFX1250-LABEL: flat_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -68,6 +69,7 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; ; GFX1250-LABEL: flat_last_use_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -108,6 +110,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; ; GFX1250-LABEL: flat_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -139,6 +142,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; ; GFX1250-LABEL: flat_last_use_and_nontemporal_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index a2a8ce75d7fb4..c4bd4e1cf4337 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX1250-LABEL: flat_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -546,6 +547,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX1250-LABEL: flat_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -745,6 +747,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX1250-LABEL: flat_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -1088,6 +1091,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX1250-LABEL: flat_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -1300,6 +1304,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX1250-LABEL: flat_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 7d357922ac307..aa314f74eb74c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX1250-LABEL: flat_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX1250-LABEL: flat_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX1250-LABEL: flat_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -770,6 +773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -931,6 +935,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX1250-LABEL: flat_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1091,6 +1096,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX1250-LABEL: flat_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1251,6 +1257,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX1250-LABEL: flat_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1411,6 +1418,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1571,6 +1579,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1731,6 +1740,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1891,6 +1901,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2051,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2211,6 +2223,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2415,6 +2428,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2622,6 +2636,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2829,6 +2844,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3080,6 +3096,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3333,6 +3350,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3586,6 +3604,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3839,6 +3858,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4092,6 +4112,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4345,6 +4366,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4598,6 +4620,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4851,6 +4874,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5104,6 +5128,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5357,6 +5382,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5610,6 +5636,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5863,6 +5890,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6116,6 +6144,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6369,6 +6398,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6622,6 +6652,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6919,6 +6950,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7220,6 +7252,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7521,6 +7554,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7822,6 +7856,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8123,6 +8158,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8424,6 +8460,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8725,6 +8762,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9026,6 +9064,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9327,6 +9366,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9628,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9929,6 +9970,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10230,6 +10272,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10531,6 +10574,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10832,6 +10876,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11133,6 +11178,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11334,6 +11380,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX1250-LABEL: flat_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11527,6 +11574,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11720,6 +11768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11913,6 +11962,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12074,6 +12124,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12234,6 +12285,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12394,6 +12446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12554,6 +12607,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12714,6 +12768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12874,6 +12929,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13034,6 +13090,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13194,6 +13251,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13354,6 +13412,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13558,6 +13617,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13765,6 +13825,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13972,6 +14033,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14223,6 +14285,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14476,6 +14539,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14729,6 +14793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14982,6 +15047,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15235,6 +15301,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15488,6 +15555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15741,6 +15809,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15994,6 +16063,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16247,6 +16317,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16500,6 +16571,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16753,6 +16825,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17006,6 +17079,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17259,6 +17333,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17512,6 +17587,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17765,6 +17841,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18062,6 +18139,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18363,6 +18441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18664,6 +18743,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18965,6 +19045,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19266,6 +19347,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19567,6 +19649,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19868,6 +19951,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20169,6 +20253,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20470,6 +20555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20771,6 +20857,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21072,6 +21159,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21373,6 +21461,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21674,6 +21763,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21975,6 +22065,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22276,6 +22367,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index d5b37650ae9cc..1ee6067ec1f2c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX1250-LABEL: flat_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX1250-LABEL: flat_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -594,6 +596,7 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX1250-LABEL: flat_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -827,6 +830,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX1250-LABEL: flat_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -991,6 +995,7 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX1250-LABEL: flat_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1151,6 +1156,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX1250-LABEL: flat_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1339,6 +1345,7 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX1250-LABEL: flat_system_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1530,6 +1537,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX1250-LABEL: flat_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1693,6 +1701,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1886,6 +1895,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2076,6 +2086,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX1250-LABEL: flat_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2300,6 +2311,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2526,6 +2538,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2752,6 +2765,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3005,6 +3019,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3261,6 +3276,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3516,6 +3532,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3802,6 +3819,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4085,6 +4103,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4402,6 +4421,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4721,6 +4741,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5012,6 +5033,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5300,6 +5322,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5616,6 +5639,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5935,6 +5959,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6254,6 +6279,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6573,6 +6599,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6892,6 +6919,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7211,6 +7239,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7530,6 +7559,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7849,6 +7879,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8151,6 +8182,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8469,6 +8501,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8799,6 +8832,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9148,6 +9182,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9498,6 +9533,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9820,6 +9856,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10139,6 +10176,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10486,6 +10524,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10836,6 +10875,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11186,6 +11226,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11536,6 +11577,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11886,6 +11928,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12236,6 +12279,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12586,6 +12630,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12936,6 +12981,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13141,6 +13187,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX1250-LABEL: flat_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13334,6 +13381,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13554,6 +13602,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX1250-LABEL: flat_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13798,6 +13847,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13963,6 +14013,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX1250-LABEL: flat_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14123,6 +14174,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14311,6 +14363,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX1250-LABEL: flat_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14502,6 +14555,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -14665,6 +14719,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14854,6 +14909,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15044,6 +15100,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15264,6 +15321,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15486,6 +15544,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15722,6 +15781,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15986,6 +16046,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -16253,6 +16314,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -16509,6 +16571,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16791,6 +16854,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17074,6 +17138,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17387,6 +17452,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17702,6 +17768,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17989,6 +18056,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18273,6 +18341,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18585,6 +18654,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18900,6 +18970,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19215,6 +19286,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19530,6 +19602,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19845,6 +19918,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20160,6 +20234,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20475,6 +20550,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20790,6 +20866,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21092,6 +21169,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21420,6 +21498,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21751,6 +21830,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22110,6 +22190,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22471,6 +22552,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22804,6 +22886,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23134,6 +23217,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23492,6 +23576,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23853,6 +23938,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24214,6 +24300,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24575,6 +24662,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -24936,6 +25024,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25297,6 +25386,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -25658,6 +25748,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -26019,6 +26110,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index c59b0ee83e955..3c69363ad8d5b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -147,6 +147,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX1250-LABEL: flat_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -414,6 +415,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX1250-LABEL: flat_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX1250-LABEL: flat_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -841,6 +844,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX1250-LABEL: flat_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -996,6 +1000,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX1250-LABEL: flat_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -1131,6 +1136,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX1250-LABEL: flat_volatile_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index b8e324ff5f458..f5f2f5b403a76 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX1250-LABEL: flat_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX1250-LABEL: flat_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX1250-LABEL: flat_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -770,6 +773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -931,6 +935,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX1250-LABEL: flat_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1091,6 +1096,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX1250-LABEL: flat_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1251,6 +1257,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX1250-LABEL: flat_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1411,6 +1418,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1571,6 +1579,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1731,6 +1740,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1891,6 +1901,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2051,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2211,6 +2223,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2415,6 +2428,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2622,6 +2636,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2829,6 +2844,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3080,6 +3096,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3333,6 +3350,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3586,6 +3604,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3839,6 +3858,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4092,6 +4112,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4345,6 +4366,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4598,6 +4620,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4851,6 +4874,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5104,6 +5128,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5357,6 +5382,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5610,6 +5636,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5863,6 +5890,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6116,6 +6144,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6369,6 +6398,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6622,6 +6652,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6919,6 +6950,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7220,6 +7252,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7521,6 +7554,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7822,6 +7856,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8123,6 +8158,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8424,6 +8460,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8725,6 +8762,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9026,6 +9064,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9327,6 +9366,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9628,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9929,6 +9970,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10230,6 +10272,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10531,6 +10574,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10832,6 +10876,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11133,6 +11178,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11334,6 +11380,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX1250-LABEL: flat_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11527,6 +11574,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11720,6 +11768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11913,6 +11962,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12074,6 +12124,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12234,6 +12285,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12394,6 +12446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12554,6 +12607,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12714,6 +12768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12874,6 +12929,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13034,6 +13090,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13194,6 +13251,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13354,6 +13412,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13558,6 +13617,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13765,6 +13825,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13972,6 +14033,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14223,6 +14285,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14476,6 +14539,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14729,6 +14793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14982,6 +15047,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15235,6 +15301,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15488,6 +15555,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15741,6 +15809,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15994,6 +16063,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16247,6 +16317,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16500,6 +16571,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16753,6 +16825,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17006,6 +17079,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17259,6 +17333,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17512,6 +17587,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17765,6 +17841,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18062,6 +18139,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18363,6 +18441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18664,6 +18743,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18965,6 +19045,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19266,6 +19347,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19567,6 +19649,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19868,6 +19951,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20169,6 +20253,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20470,6 +20555,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20771,6 +20857,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21072,6 +21159,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21373,6 +21461,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21674,6 +21763,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21975,6 +22065,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index d44e7fff2359f..11883bff6f269 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX1250-LABEL: flat_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX1250-LABEL: flat_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -589,6 +591,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX1250-LABEL: flat_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -813,6 +816,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -976,6 +980,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX1250-LABEL: flat_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1136,6 +1141,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX1250-LABEL: flat_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1318,6 +1324,7 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX1250-LABEL: flat_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1502,6 +1509,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1664,6 +1672,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1843,6 +1852,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2026,6 +2036,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2231,6 +2242,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2437,6 +2449,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2656,6 +2669,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2894,6 +2908,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3134,6 +3149,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3387,6 +3403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3659,6 +3676,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3935,6 +3953,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4233,6 +4252,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4532,6 +4552,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4807,6 +4828,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5080,6 +5102,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5377,6 +5400,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5676,6 +5700,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5975,6 +6000,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6274,6 +6300,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6574,6 +6601,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6887,6 +6915,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7210,6 +7239,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7544,6 +7574,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7878,6 +7909,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8193,6 +8225,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8506,6 +8539,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8838,6 +8872,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9172,6 +9207,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9506,6 +9542,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9843,6 +9880,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10180,6 +10218,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10514,6 +10553,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10848,6 +10888,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11182,6 +11223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11385,6 +11427,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX1250-LABEL: flat_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11578,6 +11621,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11779,6 +11823,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12001,6 +12046,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12164,6 +12210,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12324,6 +12371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12502,6 +12550,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12682,6 +12731,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12844,6 +12894,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13014,6 +13065,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13193,6 +13245,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13386,6 +13439,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13580,6 +13634,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13795,6 +13850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14031,6 +14087,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14269,6 +14326,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14522,6 +14580,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14785,6 +14844,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15057,6 +15117,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15343,6 +15404,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15630,6 +15692,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15896,6 +15959,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16160,6 +16224,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16445,6 +16510,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16732,6 +16798,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17019,6 +17086,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17303,6 +17371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17587,6 +17656,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17874,6 +17944,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18161,6 +18232,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18448,6 +18520,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18748,6 +18821,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19057,6 +19131,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19376,6 +19451,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19708,6 +19784,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20040,6 +20117,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20351,6 +20429,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20660,6 +20739,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20990,6 +21070,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21322,6 +21403,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21654,6 +21736,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21983,6 +22066,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22312,6 +22396,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22644,6 +22729,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22976,6 +23062,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23308,6 +23395,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 493f985c84701..74fd60a259ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX1250-LABEL: global_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX1250-LABEL: global_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -600,6 +602,7 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX1250-LABEL: global_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -827,6 +830,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX1250-LABEL: global_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -998,6 +1002,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX1250-LABEL: global_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1165,6 +1170,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX1250-LABEL: global_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,6 +1363,7 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX1250-LABEL: global_agent_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1552,6 +1559,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX1250-LABEL: global_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1720,6 +1728,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1914,6 +1923,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX1250-LABEL: global_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2106,6 +2116,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX1250-LABEL: global_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2328,6 +2339,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2552,6 +2564,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2766,6 +2779,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3004,6 +3018,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3245,6 +3260,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3481,6 +3497,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3744,6 +3761,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4005,6 +4023,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4296,6 +4315,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4589,6 +4609,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4857,6 +4878,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5122,6 +5144,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5412,6 +5435,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5705,6 +5729,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5998,6 +6023,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6291,6 +6317,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6584,6 +6611,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6877,6 +6905,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7170,6 +7199,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7463,6 +7493,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7732,6 +7763,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8016,6 +8048,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8310,6 +8343,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8622,6 +8656,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8935,6 +8970,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9223,6 +9259,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9508,6 +9545,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9818,6 +9856,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10131,6 +10170,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10444,6 +10484,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10757,6 +10798,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11070,6 +11112,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11383,6 +11426,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11696,6 +11740,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12009,6 +12054,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12216,6 +12262,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX1250-LABEL: global_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12411,6 +12458,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12622,6 +12670,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX1250-LABEL: global_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12849,6 +12898,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -13020,6 +13070,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX1250-LABEL: global_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13187,6 +13238,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13379,6 +13431,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX1250-LABEL: global_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13574,6 +13627,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13742,6 +13796,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13936,6 +13991,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14128,6 +14184,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14350,6 +14407,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14574,6 +14632,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14788,6 +14847,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15026,6 +15086,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15267,6 +15328,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15503,6 +15565,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15766,6 +15829,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16027,6 +16091,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16318,6 +16383,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16611,6 +16677,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16879,6 +16946,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17144,6 +17212,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17434,6 +17503,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17727,6 +17797,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18020,6 +18091,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18313,6 +18385,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18606,6 +18679,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18899,6 +18973,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19192,6 +19267,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19485,6 +19561,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19754,6 +19831,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20038,6 +20116,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20348,6 +20427,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20661,6 +20741,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20949,6 +21030,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21234,6 +21316,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21544,6 +21627,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21857,6 +21941,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22170,6 +22255,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22483,6 +22569,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22796,6 +22883,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23109,6 +23197,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23422,6 +23511,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23735,6 +23825,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll index 40257df684990..d0d527eb363ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_cluster_unordered_load( ; ; GFX1250-LABEL: global_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_cluster_monotonic_load( ; ; GFX1250-LABEL: global_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -600,6 +602,7 @@ define amdgpu_kernel void @global_cluster_acquire_load( ; ; GFX1250-LABEL: global_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -827,6 +830,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; ; GFX1250-LABEL: global_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -998,6 +1002,7 @@ define amdgpu_kernel void @global_cluster_unordered_store( ; ; GFX1250-LABEL: global_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1165,6 +1170,7 @@ define amdgpu_kernel void @global_cluster_monotonic_store( ; ; GFX1250-LABEL: global_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1357,6 +1363,7 @@ define amdgpu_kernel void @global_cluster_release_store( ; ; GFX1250-LABEL: global_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1551,6 +1558,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_store( ; ; GFX1250-LABEL: global_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1718,6 +1726,7 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1912,6 +1921,7 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw( ; ; GFX1250-LABEL: global_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2104,6 +2114,7 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw( ; ; GFX1250-LABEL: global_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2325,6 +2336,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2548,6 +2560,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2761,6 +2774,7 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2999,6 +3013,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3239,6 +3254,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3474,6 +3490,7 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3737,6 +3754,7 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3998,6 +4016,7 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4288,6 +4307,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4580,6 +4600,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4847,6 +4868,7 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5112,6 +5134,7 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5402,6 +5425,7 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5694,6 +5718,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5986,6 +6011,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6278,6 +6304,7 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6570,6 +6597,7 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6862,6 +6890,7 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7154,6 +7183,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7446,6 +7476,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7714,6 +7745,7 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7998,6 +8030,7 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8292,6 +8325,7 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8603,6 +8637,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8915,6 +8950,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9202,6 +9238,7 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9487,6 +9524,7 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9797,6 +9835,7 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10109,6 +10148,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10421,6 +10461,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10733,6 +10774,7 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11045,6 +11087,7 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11357,6 +11400,7 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11669,6 +11713,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11981,6 +12026,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -12187,6 +12233,7 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load( ; ; GFX1250-LABEL: global_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12382,6 +12429,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12593,6 +12641,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12820,6 +12869,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12991,6 +13041,7 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store( ; ; GFX1250-LABEL: global_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13158,6 +13209,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13350,6 +13402,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_store( ; ; GFX1250-LABEL: global_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13544,6 +13597,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13711,6 +13765,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13905,6 +13960,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14097,6 +14153,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14318,6 +14375,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14541,6 +14599,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14754,6 +14813,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14992,6 +15052,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15232,6 +15293,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -15467,6 +15529,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15730,6 +15793,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15991,6 +16055,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16281,6 +16346,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16573,6 +16639,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16840,6 +16907,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17105,6 +17173,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17395,6 +17464,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17687,6 +17757,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17979,6 +18050,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18271,6 +18343,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18563,6 +18636,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18855,6 +18929,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19147,6 +19222,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19439,6 +19515,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19707,6 +19784,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19991,6 +20069,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20301,6 +20380,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20613,6 +20693,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20900,6 +20981,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21185,6 +21267,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21495,6 +21578,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21807,6 +21891,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22119,6 +22204,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22431,6 +22517,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22743,6 +22830,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23055,6 +23143,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23367,6 +23456,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -23679,6 +23769,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index ca7802d295e0b..5082378c6b770 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -18,6 +18,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; ; GFX1250-LABEL: global_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -53,6 +54,7 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; ; GFX1250-LABEL: global_last_use_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -89,6 +91,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; ; GFX1250-LABEL: global_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -123,6 +126,7 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; ; GFX1250-LABEL: global_last_use_and_nontemporal_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 72cbbc0283545..d0328484e159d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -193,6 +193,7 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX1250-LABEL: global_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -458,6 +459,7 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX1250-LABEL: global_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -658,6 +660,7 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX1250-LABEL: global_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -897,6 +900,7 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX1250-LABEL: global_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1101,6 +1105,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX1250-LABEL: global_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index e7f7b1d196be7..3e54b23c82f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX1250-LABEL: global_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX1250-LABEL: global_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -584,6 +586,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX1250-LABEL: global_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -779,6 +782,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX1250-LABEL: global_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -947,6 +951,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX1250-LABEL: global_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1114,6 +1119,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX1250-LABEL: global_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1281,6 +1287,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX1250-LABEL: global_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1448,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX1250-LABEL: global_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1613,6 +1621,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1778,6 +1787,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1943,6 +1953,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2108,6 +2119,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2273,6 +2285,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2466,6 +2479,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2662,6 +2676,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2858,6 +2873,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3090,6 +3106,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3324,6 +3341,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3558,6 +3576,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3792,6 +3811,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4026,6 +4046,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4260,6 +4281,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4494,6 +4516,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4728,6 +4751,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4962,6 +4986,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5196,6 +5221,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5430,6 +5456,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5664,6 +5691,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5898,6 +5926,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6132,6 +6161,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6366,6 +6396,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6630,6 +6661,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6898,6 +6930,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7166,6 +7199,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7434,6 +7468,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7702,6 +7737,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7970,6 +8006,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8238,6 +8275,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8506,6 +8544,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8774,6 +8813,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9042,6 +9082,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9310,6 +9351,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9578,6 +9620,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9846,6 +9889,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10114,6 +10158,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10382,6 +10427,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10585,6 +10631,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX1250-LABEL: global_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -10780,6 +10827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -10975,6 +11023,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11170,6 +11219,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11338,6 +11388,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX1250-LABEL: global_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11505,6 +11556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11672,6 +11724,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX1250-LABEL: global_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11839,6 +11892,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12004,6 +12058,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12169,6 +12224,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12334,6 +12390,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12499,6 +12556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12664,6 +12722,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12857,6 +12916,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13053,6 +13113,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13249,6 +13310,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13481,6 +13543,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13715,6 +13778,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13949,6 +14013,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14183,6 +14248,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14417,6 +14483,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14651,6 +14718,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14885,6 +14953,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15119,6 +15188,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15353,6 +15423,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15587,6 +15658,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15821,6 +15893,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16055,6 +16128,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16289,6 +16363,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16523,6 +16598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16757,6 +16833,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17021,6 +17098,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17289,6 +17367,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17557,6 +17636,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17825,6 +17905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18093,6 +18174,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18361,6 +18443,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18629,6 +18712,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18897,6 +18981,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19165,6 +19250,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19433,6 +19519,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19701,6 +19788,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19969,6 +20057,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20237,6 +20326,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20505,6 +20595,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20773,6 +20864,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index ee5a8bf742fe7..8155f1b9843a2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX1250-LABEL: global_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX1250-LABEL: global_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -602,6 +604,7 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX1250-LABEL: global_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -831,6 +834,7 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX1250-LABEL: global_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -1002,6 +1006,7 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX1250-LABEL: global_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1169,6 +1174,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX1250-LABEL: global_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1365,6 +1371,7 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX1250-LABEL: global_system_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1564,6 +1571,7 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX1250-LABEL: global_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1732,6 +1740,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1928,6 +1937,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX1250-LABEL: global_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2124,6 +2134,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX1250-LABEL: global_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2352,6 +2363,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2582,6 +2594,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2798,6 +2811,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3042,6 +3056,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3289,6 +3304,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3525,6 +3541,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3790,6 +3807,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4055,6 +4073,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4352,6 +4371,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4651,6 +4671,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4921,6 +4942,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5188,6 +5210,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5484,6 +5507,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5783,6 +5807,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6082,6 +6107,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6381,6 +6407,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6650,6 +6677,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6936,6 +6964,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7252,6 +7281,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7571,6 +7601,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7861,6 +7892,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8148,6 +8180,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8464,6 +8497,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8783,6 +8817,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9102,6 +9137,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9421,6 +9457,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9740,6 +9777,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10059,6 +10097,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10378,6 +10417,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10697,6 +10737,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10904,6 +10945,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX1250-LABEL: global_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11099,6 +11141,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX1250-LABEL: global_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11312,6 +11355,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX1250-LABEL: global_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11541,6 +11585,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11712,6 +11757,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX1250-LABEL: global_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11879,6 +11925,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX1250-LABEL: global_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12075,6 +12122,7 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX1250-LABEL: global_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12274,6 +12322,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12442,6 +12491,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12638,6 +12688,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12834,6 +12885,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13062,6 +13114,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13292,6 +13345,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13508,6 +13562,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13752,6 +13807,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13999,6 +14055,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14235,6 +14292,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14500,6 +14558,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14765,6 +14824,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15062,6 +15122,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15361,6 +15422,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15631,6 +15693,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15898,6 +15961,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16194,6 +16258,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16493,6 +16558,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16792,6 +16858,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17091,6 +17158,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17390,6 +17458,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17689,6 +17758,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17988,6 +18058,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18287,6 +18358,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18556,6 +18628,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18842,6 +18915,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19140,6 +19214,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19458,6 +19533,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19777,6 +19853,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20067,6 +20144,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20354,6 +20432,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20670,6 +20749,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20989,6 +21069,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21308,6 +21389,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21627,6 +21709,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21946,6 +22029,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22265,6 +22349,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22584,6 +22669,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22903,6 +22989,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 2a40ee532be98..0b240887e4025 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -150,6 +150,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX1250-LABEL: global_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -354,6 +355,7 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX1250-LABEL: global_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -525,6 +527,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX1250-LABEL: global_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -724,6 +727,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX1250-LABEL: global_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -884,6 +888,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX1250-LABEL: global_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -1031,6 +1036,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX1250-LABEL: global_volatile_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 09eb062d876f6..7f68ab6b3faf2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX1250-LABEL: global_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX1250-LABEL: global_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -584,6 +586,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX1250-LABEL: global_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -779,6 +782,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX1250-LABEL: global_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -947,6 +951,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX1250-LABEL: global_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1114,6 +1119,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX1250-LABEL: global_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1281,6 +1287,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX1250-LABEL: global_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1448,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX1250-LABEL: global_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1613,6 +1621,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1778,6 +1787,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1943,6 +1953,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2108,6 +2119,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2273,6 +2285,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2466,6 +2479,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2662,6 +2676,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2858,6 +2873,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3090,6 +3106,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3324,6 +3341,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3558,6 +3576,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3792,6 +3811,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4026,6 +4046,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4260,6 +4281,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4494,6 +4516,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4728,6 +4751,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4962,6 +4986,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5196,6 +5221,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5430,6 +5456,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5664,6 +5691,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5898,6 +5926,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6132,6 +6161,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6366,6 +6396,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6630,6 +6661,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6898,6 +6930,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7166,6 +7199,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7434,6 +7468,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7702,6 +7737,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7970,6 +8006,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8238,6 +8275,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8506,6 +8544,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8774,6 +8813,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9042,6 +9082,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9310,6 +9351,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9578,6 +9620,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9846,6 +9889,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10114,6 +10158,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10382,6 +10427,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10585,6 +10631,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX1250-LABEL: global_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -10780,6 +10827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -10975,6 +11023,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11170,6 +11219,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11338,6 +11388,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX1250-LABEL: global_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11505,6 +11556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11672,6 +11724,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX1250-LABEL: global_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -11839,6 +11892,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12004,6 +12058,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12169,6 +12224,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12334,6 +12390,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12499,6 +12556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12664,6 +12722,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -12857,6 +12916,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13053,6 +13113,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13249,6 +13310,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13481,6 +13543,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13715,6 +13778,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -13949,6 +14013,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14183,6 +14248,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14417,6 +14483,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14651,6 +14718,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14885,6 +14953,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15119,6 +15188,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15353,6 +15423,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15587,6 +15658,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15821,6 +15893,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16055,6 +16128,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16289,6 +16363,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16523,6 +16598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16757,6 +16833,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17021,6 +17098,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17289,6 +17367,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17557,6 +17636,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17825,6 +17905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18093,6 +18174,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18361,6 +18443,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18629,6 +18712,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18897,6 +18981,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19165,6 +19250,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19433,6 +19519,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19701,6 +19788,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19969,6 +20057,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20237,6 +20326,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20505,6 +20595,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20773,6 +20864,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 868b438151558..3a36890e5088b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -194,6 +194,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX1250-LABEL: global_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -389,6 +390,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX1250-LABEL: global_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -589,6 +591,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX1250-LABEL: global_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -804,6 +807,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX1250-LABEL: global_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -974,6 +978,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX1250-LABEL: global_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1141,6 +1146,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX1250-LABEL: global_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1331,6 +1337,7 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX1250-LABEL: global_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1523,6 +1530,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX1250-LABEL: global_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -1690,6 +1698,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -1865,6 +1874,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2054,6 +2064,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2257,6 +2268,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2461,6 +2473,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2662,6 +2675,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -2886,6 +2900,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3112,6 +3127,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -3346,6 +3362,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3590,6 +3607,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -3848,6 +3866,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4120,6 +4139,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4393,6 +4413,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4640,6 +4661,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -4885,6 +4907,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5156,6 +5179,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5429,6 +5453,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5702,6 +5727,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -5972,6 +5998,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6242,6 +6269,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6515,6 +6543,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -6788,6 +6817,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7061,6 +7091,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7328,6 +7359,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7601,6 +7633,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -7892,6 +7925,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8190,6 +8224,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8488,6 +8523,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -8763,6 +8799,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9036,6 +9073,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9332,6 +9370,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9630,6 +9669,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -9928,6 +9968,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10226,6 +10267,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10524,6 +10566,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -10822,6 +10865,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11120,6 +11164,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11418,6 +11463,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -11623,6 +11669,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX1250-LABEL: global_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -11818,6 +11865,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12018,6 +12066,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12230,6 +12279,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -12400,6 +12450,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX1250-LABEL: global_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12567,6 +12618,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12752,6 +12804,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX1250-LABEL: global_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -12939,6 +12992,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -13106,6 +13160,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13281,6 +13336,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13465,6 +13521,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13663,6 +13720,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -13862,6 +13920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14063,6 +14122,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14282,6 +14342,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14503,6 +14564,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 @@ -14737,6 +14799,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -14981,6 +15044,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15234,6 +15298,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15501,6 +15566,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -15769,6 +15835,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16016,6 +16083,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16261,6 +16329,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16527,6 +16596,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -16795,6 +16865,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17063,6 +17134,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17328,6 +17400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17593,6 +17666,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -17861,6 +17935,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18129,6 +18204,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18397,6 +18473,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18664,6 +18741,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -18937,6 +19015,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19223,6 +19302,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19516,6 +19596,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -19809,6 +19890,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20084,6 +20166,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20357,6 +20440,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20648,6 +20732,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -20941,6 +21026,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21234,6 +21320,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21527,6 +21614,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -21820,6 +21908,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22113,6 +22202,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22406,6 +22496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 @@ -22699,6 +22790,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 712109d2f67f9..336b980e544cf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_agent_unordered_load( ; ; GFX1250-LABEL: local_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; ; GFX1250-LABEL: local_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -552,6 +554,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; ; GFX1250-LABEL: local_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -763,6 +766,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; ; GFX1250-LABEL: local_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -918,6 +922,7 @@ define amdgpu_kernel void @local_agent_unordered_store( ; ; GFX1250-LABEL: local_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; ; GFX1250-LABEL: local_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_agent_release_store( ; ; GFX1250-LABEL: local_agent_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; ; GFX1250-LABEL: local_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; ; GFX1250-LABEL: local_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; ; GFX1250-LABEL: local_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; ; GFX1250-LABEL: local_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; ; GFX1250-LABEL: local_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; ; GFX1250-LABEL: local_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; ; GFX1250-LABEL: local_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll index 6d1e4e6a96119..7d11fd74ec863 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_cluster_unordered_load( ; ; GFX1250-LABEL: local_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_cluster_monotonic_load( ; ; GFX1250-LABEL: local_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -552,6 +554,7 @@ define amdgpu_kernel void @local_cluster_acquire_load( ; ; GFX1250-LABEL: local_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -763,6 +766,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_load( ; ; GFX1250-LABEL: local_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -918,6 +922,7 @@ define amdgpu_kernel void @local_cluster_unordered_store( ; ; GFX1250-LABEL: local_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_cluster_monotonic_store( ; ; GFX1250-LABEL: local_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_cluster_release_store( ; ; GFX1250-LABEL: local_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; ; GFX1250-LABEL: local_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_cluster_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw( ; ; GFX1250-LABEL: local_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; ; GFX1250-LABEL: local_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_load( ; ; GFX1250-LABEL: local_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_load( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_load( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store( ; ; GFX1250-LABEL: local_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_store( ; ; GFX1250-LABEL: local_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 9888204b997a9..550afd4141742 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -197,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX1250-LABEL: local_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -428,6 +429,7 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX1250-LABEL: local_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -612,6 +614,7 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; ; GFX1250-LABEL: local_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -813,6 +816,7 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1017,6 +1021,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX1250-LABEL: local_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index 1800acbbf605b..e72a28b3efc66 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; ; GFX1250-LABEL: local_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; ; GFX1250-LABEL: local_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -547,6 +549,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; ; GFX1250-LABEL: local_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -730,6 +733,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; ; GFX1250-LABEL: local_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -883,6 +887,7 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; ; GFX1250-LABEL: local_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1033,6 +1038,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; ; GFX1250-LABEL: local_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1183,6 +1189,7 @@ define amdgpu_kernel void @local_singlethread_release_store( ; ; GFX1250-LABEL: local_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1333,6 +1340,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; ; GFX1250-LABEL: local_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1483,6 +1491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1633,6 +1642,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1783,6 +1793,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1933,6 +1944,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2083,6 +2095,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2275,6 +2288,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2471,6 +2485,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2667,6 +2682,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2847,6 +2863,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3026,6 +3043,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3205,6 +3223,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3384,6 +3403,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3563,6 +3583,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3742,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3921,6 +3943,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4100,6 +4123,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4279,6 +4303,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4458,6 +4483,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4663,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4816,6 +4843,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4995,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5174,6 +5203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5353,6 +5383,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5574,6 +5605,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5800,6 +5832,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6026,6 +6059,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6252,6 +6286,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6478,6 +6513,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6704,6 +6740,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6930,6 +6967,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7156,6 +7194,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7382,6 +7421,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7608,6 +7648,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7834,6 +7875,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8060,6 +8102,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8286,6 +8329,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8512,6 +8556,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8738,6 +8783,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8926,6 +8972,7 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; ; GFX1250-LABEL: local_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9109,6 +9156,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9292,6 +9340,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9475,6 +9524,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9628,6 +9678,7 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; ; GFX1250-LABEL: local_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9778,6 +9829,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9928,6 +9980,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; ; GFX1250-LABEL: local_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10078,6 +10131,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10228,6 +10282,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10378,6 +10433,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10528,6 +10584,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10678,6 +10735,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10828,6 +10886,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11020,6 +11079,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11216,6 +11276,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11412,6 +11473,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11592,6 +11654,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11771,6 +11834,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11950,6 +12014,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12129,6 +12194,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12308,6 +12374,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12487,6 +12554,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12666,6 +12734,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12845,6 +12914,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13024,6 +13094,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13203,6 +13274,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13382,6 +13454,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13561,6 +13634,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13740,6 +13814,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13919,6 +13994,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14098,6 +14174,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14319,6 +14396,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14545,6 +14623,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14771,6 +14850,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14997,6 +15077,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15223,6 +15304,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15449,6 +15531,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15675,6 +15758,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15901,6 +15985,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16127,6 +16212,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16353,6 +16439,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16579,6 +16666,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; ; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16805,6 +16893,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17031,6 +17120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17257,6 +17347,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17483,6 +17574,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 577d2ca9514bb..3c9f00829c9c8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_system_unordered_load( ; ; GFX1250-LABEL: local_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_system_monotonic_load( ; ; GFX1250-LABEL: local_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -552,6 +554,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; ; GFX1250-LABEL: local_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -763,6 +766,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; ; GFX1250-LABEL: local_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -918,6 +922,7 @@ define amdgpu_kernel void @local_system_unordered_store( ; ; GFX1250-LABEL: local_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_system_monotonic_store( ; ; GFX1250-LABEL: local_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_system_release_store( ; ; GFX1250-LABEL: local_system_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; ; GFX1250-LABEL: local_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; ; GFX1250-LABEL: local_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; ; GFX1250-LABEL: local_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; ; GFX1250-LABEL: local_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; ; GFX1250-LABEL: local_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; ; GFX1250-LABEL: local_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; ; GFX1250-LABEL: local_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; ; GFX1250-LABEL: local_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; ; GFX1250-LABEL: local_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 33c516c61e42c..f7ac29ef69477 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -145,6 +145,7 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX1250-LABEL: local_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -316,6 +317,7 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX1250-LABEL: local_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -452,6 +454,7 @@ define amdgpu_kernel void @local_volatile_store_0( ; ; GFX1250-LABEL: local_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -597,6 +600,7 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -740,6 +744,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; ; GFX1250-LABEL: local_volatile_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -872,6 +877,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; ; GFX1250-LABEL: local_volatile_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 7e345ed6e2716..73a557f85b05d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; ; GFX1250-LABEL: local_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; ; GFX1250-LABEL: local_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -547,6 +549,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; ; GFX1250-LABEL: local_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -730,6 +733,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; ; GFX1250-LABEL: local_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -883,6 +887,7 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; ; GFX1250-LABEL: local_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1033,6 +1038,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; ; GFX1250-LABEL: local_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1183,6 +1189,7 @@ define amdgpu_kernel void @local_wavefront_release_store( ; ; GFX1250-LABEL: local_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1333,6 +1340,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; ; GFX1250-LABEL: local_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1483,6 +1491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1633,6 +1642,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1783,6 +1793,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1933,6 +1944,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2083,6 +2095,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2275,6 +2288,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2471,6 +2485,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2667,6 +2682,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2847,6 +2863,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3026,6 +3043,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3205,6 +3223,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3384,6 +3403,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3563,6 +3583,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3742,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3921,6 +3943,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4100,6 +4123,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4279,6 +4303,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4458,6 +4483,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4637,6 +4663,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4816,6 +4843,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4995,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5174,6 +5203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5353,6 +5383,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5574,6 +5605,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -5800,6 +5832,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6026,6 +6059,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6252,6 +6286,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6478,6 +6513,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6704,6 +6740,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6930,6 +6967,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7156,6 +7194,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7382,6 +7421,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7608,6 +7648,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7834,6 +7875,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8060,6 +8102,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8286,6 +8329,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8512,6 +8556,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8738,6 +8783,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8926,6 +8972,7 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; ; GFX1250-LABEL: local_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9109,6 +9156,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9292,6 +9340,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9475,6 +9524,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9628,6 +9678,7 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; ; GFX1250-LABEL: local_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9778,6 +9829,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -9928,6 +9980,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; ; GFX1250-LABEL: local_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10078,6 +10131,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10228,6 +10282,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10378,6 +10433,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10528,6 +10584,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10678,6 +10735,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10828,6 +10886,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11020,6 +11079,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11216,6 +11276,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11412,6 +11473,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11592,6 +11654,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11771,6 +11834,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -11950,6 +12014,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12129,6 +12194,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12308,6 +12374,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12487,6 +12554,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12666,6 +12734,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12845,6 +12914,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13024,6 +13094,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13203,6 +13274,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13382,6 +13454,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13561,6 +13634,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13740,6 +13814,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13919,6 +13994,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14098,6 +14174,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14319,6 +14396,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14545,6 +14623,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14771,6 +14850,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -14997,6 +15077,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15223,6 +15304,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15449,6 +15531,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15675,6 +15758,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15901,6 +15985,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16127,6 +16212,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16353,6 +16439,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16579,6 +16666,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16805,6 +16893,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17031,6 +17120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17257,6 +17347,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17483,6 +17574,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index ab4d7834b23a5..44a965b753198 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -181,6 +181,7 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; ; GFX1250-LABEL: local_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -364,6 +365,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; ; GFX1250-LABEL: local_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -552,6 +554,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; ; GFX1250-LABEL: local_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -763,6 +766,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; ; GFX1250-LABEL: local_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -918,6 +922,7 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; ; GFX1250-LABEL: local_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; ; GFX1250-LABEL: local_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_workgroup_release_store( ; ; GFX1250-LABEL: local_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; ; GFX1250-LABEL: local_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; ; GFX1250-LABEL: local_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; ; GFX1250-LABEL: local_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; ; GFX1250-LABEL: local_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index 06a0514404fbc..f520943983b3a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_agent_unordered_load( ; ; GFX1250-LABEL: private_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; ; GFX1250-LABEL: private_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_agent_acquire_load( ; ; GFX1250-LABEL: private_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -777,6 +780,7 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; ; GFX1250-LABEL: private_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -959,6 +963,7 @@ define amdgpu_kernel void @private_agent_unordered_store( ; ; GFX1250-LABEL: private_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; ; GFX1250-LABEL: private_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_agent_release_store( ; ; GFX1250-LABEL: private_agent_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1496,6 +1503,7 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; ; GFX1250-LABEL: private_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1677,6 +1685,7 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_agent_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1855,6 +1864,7 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; ; GFX1250-LABEL: private_agent_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2035,6 +2045,7 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; ; GFX1250-LABEL: private_agent_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2216,6 +2227,7 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_agent_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2399,6 +2411,7 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_agent_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2636,6 +2649,7 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2872,6 +2886,7 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3111,6 +3126,7 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3380,6 +3396,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3649,6 +3666,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3920,6 +3938,7 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4192,6 +4211,7 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4466,6 +4486,7 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4740,6 +4761,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5011,6 +5033,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5282,6 +5305,7 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5556,6 +5580,7 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5830,6 +5855,7 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6104,6 +6130,7 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6378,6 +6405,7 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6652,6 +6680,7 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6926,6 +6955,7 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7200,6 +7230,7 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7502,6 +7533,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7803,6 +7835,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8105,6 +8138,7 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8409,6 +8443,7 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8714,6 +8749,7 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9019,6 +9055,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9321,6 +9358,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9623,6 +9661,7 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9928,6 +9967,7 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10233,6 +10273,7 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10538,6 +10579,7 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10843,6 +10885,7 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11148,6 +11191,7 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11453,6 +11497,7 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11758,6 +11803,7 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11971,6 +12017,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load( ; ; GFX1250-LABEL: private_agent_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12170,6 +12217,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12369,6 +12417,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load( ; ; GFX1250-LABEL: private_agent_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12570,6 +12619,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12753,6 +12803,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; ; GFX1250-LABEL: private_agent_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12931,6 +12982,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13109,6 +13161,7 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; ; GFX1250-LABEL: private_agent_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13290,6 +13343,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13471,6 +13525,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13649,6 +13704,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13829,6 +13885,7 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14010,6 +14067,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14193,6 +14251,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14430,6 +14489,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14667,6 +14727,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14907,6 +14968,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -15177,6 +15239,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15446,6 +15509,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15717,6 +15781,7 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15989,6 +16054,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16263,6 +16329,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16537,6 +16604,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16808,6 +16876,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17079,6 +17148,7 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17353,6 +17423,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17627,6 +17698,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17901,6 +17973,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18175,6 +18248,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18449,6 +18523,7 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18723,6 +18798,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18997,6 +19073,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -19299,6 +19376,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19600,6 +19678,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19903,6 +19982,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20209,6 +20289,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20515,6 +20596,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20818,6 +20900,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21121,6 +21204,7 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21427,6 +21511,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21733,6 +21818,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22039,6 +22125,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22345,6 +22432,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22651,6 +22739,7 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22957,6 +23046,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23263,6 +23353,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll index 8fcaeccbc0397..24f9fe0297c63 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_cluster_unordered_load( ; ; GFX1250-LABEL: private_cluster_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_cluster_monotonic_load( ; ; GFX1250-LABEL: private_cluster_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_cluster_acquire_load( ; ; GFX1250-LABEL: private_cluster_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -777,6 +780,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_load( ; ; GFX1250-LABEL: private_cluster_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -959,6 +963,7 @@ define amdgpu_kernel void @private_cluster_unordered_store( ; ; GFX1250-LABEL: private_cluster_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_cluster_monotonic_store( ; ; GFX1250-LABEL: private_cluster_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_cluster_release_store( ; ; GFX1250-LABEL: private_cluster_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1495,6 +1502,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_store( ; ; GFX1250-LABEL: private_cluster_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1675,6 +1683,7 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_cluster_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1853,6 +1862,7 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw( ; ; GFX1250-LABEL: private_cluster_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2033,6 +2043,7 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw( ; ; GFX1250-LABEL: private_cluster_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2213,6 +2224,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_cluster_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2395,6 +2407,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_cluster_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2631,6 +2644,7 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2867,6 +2881,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3105,6 +3120,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3373,6 +3389,7 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3642,6 +3659,7 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3913,6 +3931,7 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4184,6 +4203,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4457,6 +4477,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4730,6 +4751,7 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5001,6 +5023,7 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5272,6 +5295,7 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5545,6 +5569,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5818,6 +5843,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6091,6 +6117,7 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6364,6 +6391,7 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6637,6 +6665,7 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6910,6 +6939,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7183,6 +7213,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7484,6 +7515,7 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7785,6 +7817,7 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8087,6 +8120,7 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8390,6 +8424,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8694,6 +8729,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8998,6 +9034,7 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9300,6 +9337,7 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9602,6 +9640,7 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9906,6 +9945,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10210,6 +10250,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10514,6 +10555,7 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10818,6 +10860,7 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11122,6 +11165,7 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11426,6 +11470,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11730,6 +11775,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11942,6 +11988,7 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load( ; ; GFX1250-LABEL: private_cluster_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12141,6 +12188,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12340,6 +12388,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12541,6 +12590,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12724,6 +12774,7 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store( ; ; GFX1250-LABEL: private_cluster_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12902,6 +12953,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13080,6 +13132,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_store( ; ; GFX1250-LABEL: private_cluster_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13260,6 +13313,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13440,6 +13494,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13618,6 +13673,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13798,6 +13854,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13978,6 +14035,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14160,6 +14218,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14396,6 +14455,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14633,6 +14693,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14872,6 +14933,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -15141,6 +15203,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15410,6 +15473,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15681,6 +15745,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15952,6 +16017,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16225,6 +16291,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16498,6 +16565,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16769,6 +16837,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17040,6 +17109,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17313,6 +17383,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17586,6 +17657,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17859,6 +17931,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18132,6 +18205,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18405,6 +18479,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18678,6 +18753,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18951,6 +19027,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -19252,6 +19329,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19553,6 +19631,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19856,6 +19935,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20161,6 +20241,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20466,6 +20547,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20769,6 +20851,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21072,6 +21155,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21377,6 +21461,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21682,6 +21767,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21987,6 +22073,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22292,6 +22379,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22597,6 +22685,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22902,6 +22991,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23207,6 +23297,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 80ea48be0b893..bd84a87c2538f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -17,6 +17,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add ; ; GFX1250-LABEL: private_last_use_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -51,6 +52,7 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; ; GFX1250-LABEL: private_last_use_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -86,6 +88,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) % ; ; GFX1250-LABEL: private_last_use_and_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -114,6 +117,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5 ; ; GFX1250-LABEL: private_last_use_and_nontemporal_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 6c19722ad6e33..ac16108076d8c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -205,6 +205,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX1250-LABEL: private_nontemporal_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -453,6 +454,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX1250-LABEL: private_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -644,6 +646,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX1250-LABEL: private_nontemporal_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -862,6 +865,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX1250-LABEL: private_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1077,6 +1081,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX1250-LABEL: private_nontemporal_volatile_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll index f9189c8419898..583eaeb38870b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_singlethread_unordered_load( ; ; GFX1250-LABEL: private_singlethread_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_load( ; ; GFX1250-LABEL: private_singlethread_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_singlethread_acquire_load( ; ; GFX1250-LABEL: private_singlethread_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -776,6 +779,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load( ; ; GFX1250-LABEL: private_singlethread_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -955,6 +959,7 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; ; GFX1250-LABEL: private_singlethread_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1133,6 +1138,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; ; GFX1250-LABEL: private_singlethread_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1311,6 +1317,7 @@ define amdgpu_kernel void @private_singlethread_release_store( ; ; GFX1250-LABEL: private_singlethread_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; ; GFX1250-LABEL: private_singlethread_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1667,6 +1675,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1845,6 +1854,7 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2023,6 +2033,7 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2201,6 +2212,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2379,6 +2391,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2611,6 +2624,7 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2846,6 +2860,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3081,6 +3096,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3346,6 +3362,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3615,6 +3632,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3884,6 +3902,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4153,6 +4172,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4422,6 +4442,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4691,6 +4712,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4960,6 +4982,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5229,6 +5252,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5498,6 +5522,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5767,6 +5792,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6036,6 +6062,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6305,6 +6332,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6574,6 +6602,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6843,6 +6872,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7112,6 +7142,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7409,6 +7440,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7710,6 +7742,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8011,6 +8044,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8346,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8613,6 +8648,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8914,6 +8950,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9215,6 +9252,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9516,6 +9554,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9817,6 +9856,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10118,6 +10158,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10419,6 +10460,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10720,6 +10762,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11021,6 +11064,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11322,6 +11366,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11623,6 +11668,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11832,6 +11878,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load( ; ; GFX1250-LABEL: private_singlethread_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12031,6 +12078,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load( ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12230,6 +12278,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load( ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12429,6 +12478,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12608,6 +12658,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; ; GFX1250-LABEL: private_singlethread_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12786,6 +12837,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12964,6 +13016,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; ; GFX1250-LABEL: private_singlethread_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13142,6 +13195,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13320,6 +13374,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13498,6 +13553,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13676,6 +13732,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13854,6 +13911,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14032,6 +14090,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14264,6 +14323,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14499,6 +14559,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14734,6 +14795,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14999,6 +15061,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15268,6 +15331,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15537,6 +15601,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15806,6 +15871,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16075,6 +16141,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16344,6 +16411,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16613,6 +16681,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16882,6 +16951,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17151,6 +17221,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17420,6 +17491,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17689,6 +17761,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17958,6 +18031,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18227,6 +18301,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18496,6 +18571,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18765,6 +18841,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -19062,6 +19139,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19363,6 +19441,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19664,6 +19743,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19965,6 +20045,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20266,6 +20347,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20567,6 +20649,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20868,6 +20951,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21169,6 +21253,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21470,6 +21555,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21771,6 +21857,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22072,6 +22159,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22373,6 +22461,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22674,6 +22763,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22975,6 +23065,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23276,6 +23367,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll index e4708f544d721..1bf9db321fa0c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_system_unordered_load( ; ; GFX1250-LABEL: private_system_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_system_monotonic_load( ; ; GFX1250-LABEL: private_system_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_system_acquire_load( ; ; GFX1250-LABEL: private_system_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -777,6 +780,7 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; ; GFX1250-LABEL: private_system_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -959,6 +963,7 @@ define amdgpu_kernel void @private_system_unordered_store( ; ; GFX1250-LABEL: private_system_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_system_monotonic_store( ; ; GFX1250-LABEL: private_system_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_system_release_store( ; ; GFX1250-LABEL: private_system_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1496,6 +1503,7 @@ define amdgpu_kernel void @private_system_seq_cst_store( ; ; GFX1250-LABEL: private_system_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1677,6 +1685,7 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_system_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1855,6 +1864,7 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; ; GFX1250-LABEL: private_system_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2035,6 +2045,7 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; ; GFX1250-LABEL: private_system_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2216,6 +2227,7 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_system_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2399,6 +2411,7 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_system_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2636,6 +2649,7 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2872,6 +2886,7 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3111,6 +3126,7 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3380,6 +3396,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3649,6 +3666,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3920,6 +3938,7 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4192,6 +4211,7 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4466,6 +4486,7 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4740,6 +4761,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5011,6 +5033,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5282,6 +5305,7 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5556,6 +5580,7 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5830,6 +5855,7 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6104,6 +6130,7 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6406,6 +6433,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -6707,6 +6735,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7009,6 +7038,7 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7314,6 +7344,7 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7619,6 +7650,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7921,6 +7953,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8223,6 +8256,7 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8528,6 +8562,7 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8833,6 +8868,7 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9138,6 +9174,7 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9443,6 +9480,7 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9748,6 +9786,7 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_relese_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10053,6 +10092,7 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10358,6 +10398,7 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10571,6 +10612,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_load( ; ; GFX1250-LABEL: private_system_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -10770,6 +10812,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load( ; ; GFX1250-LABEL: private_system_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -10969,6 +11012,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_load( ; ; GFX1250-LABEL: private_system_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -11170,6 +11214,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -11353,6 +11398,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; ; GFX1250-LABEL: private_system_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -11531,6 +11577,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; ; GFX1250-LABEL: private_system_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -11709,6 +11756,7 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; ; GFX1250-LABEL: private_system_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -11890,6 +11938,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12071,6 +12120,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12249,6 +12299,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12429,6 +12480,7 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12610,6 +12662,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12793,6 +12846,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13030,6 +13084,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -13267,6 +13322,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -13507,6 +13563,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -13777,6 +13834,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14046,6 +14104,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14317,6 +14376,7 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14589,6 +14649,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -14863,6 +14924,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15137,6 +15199,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15408,6 +15471,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15679,6 +15743,7 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15953,6 +16018,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16227,6 +16293,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16501,6 +16568,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16775,6 +16843,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17049,6 +17118,7 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17323,6 +17393,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17597,6 +17668,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17899,6 +17971,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18200,6 +18273,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18503,6 +18577,7 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -18807,6 +18882,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19113,6 +19189,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19419,6 +19496,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19722,6 +19800,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20025,6 +20104,7 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20331,6 +20411,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20637,6 +20718,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20943,6 +21025,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21249,6 +21332,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21555,6 +21639,7 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21861,6 +21946,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22167,6 +22253,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7c23b76cec3e9..8d717c58c4a7c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -159,6 +159,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX1250-LABEL: private_volatile_load_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 @@ -351,6 +352,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX1250-LABEL: private_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 @@ -515,6 +517,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX1250-LABEL: private_volatile_store_0: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -697,6 +700,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX1250-LABEL: private_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll index d4c562a149b9a..a9c163a87df61 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_wavefront_unordered_load( ; ; GFX1250-LABEL: private_wavefront_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_load( ; ; GFX1250-LABEL: private_wavefront_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_wavefront_acquire_load( ; ; GFX1250-LABEL: private_wavefront_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -776,6 +779,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load( ; ; GFX1250-LABEL: private_wavefront_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -955,6 +959,7 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; ; GFX1250-LABEL: private_wavefront_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1133,6 +1138,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; ; GFX1250-LABEL: private_wavefront_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1311,6 +1317,7 @@ define amdgpu_kernel void @private_wavefront_release_store( ; ; GFX1250-LABEL: private_wavefront_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1489,6 +1496,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; ; GFX1250-LABEL: private_wavefront_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1667,6 +1675,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1845,6 +1854,7 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2023,6 +2033,7 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2201,6 +2212,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2379,6 +2391,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2611,6 +2624,7 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2846,6 +2860,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3081,6 +3096,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3346,6 +3362,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3615,6 +3632,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3884,6 +3902,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4153,6 +4172,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4422,6 +4442,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4691,6 +4712,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4960,6 +4982,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5229,6 +5252,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5498,6 +5522,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5767,6 +5792,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6036,6 +6062,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6305,6 +6332,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6574,6 +6602,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6843,6 +6872,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7112,6 +7142,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7409,6 +7440,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7710,6 +7742,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8011,6 +8044,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8312,6 +8346,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8613,6 +8648,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8914,6 +8950,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9215,6 +9252,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9516,6 +9554,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9817,6 +9856,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10118,6 +10158,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10419,6 +10460,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10720,6 +10762,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11021,6 +11064,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11322,6 +11366,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11623,6 +11668,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11832,6 +11878,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load( ; ; GFX1250-LABEL: private_wavefront_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12031,6 +12078,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12230,6 +12278,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12429,6 +12478,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12608,6 +12658,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; ; GFX1250-LABEL: private_wavefront_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12786,6 +12837,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12964,6 +13016,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; ; GFX1250-LABEL: private_wavefront_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13142,6 +13195,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13320,6 +13374,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13498,6 +13553,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13676,6 +13732,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13854,6 +13911,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14032,6 +14090,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14264,6 +14323,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14499,6 +14559,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14734,6 +14795,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14999,6 +15061,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15268,6 +15331,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15537,6 +15601,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15806,6 +15871,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16075,6 +16141,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16344,6 +16411,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16613,6 +16681,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16882,6 +16951,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17151,6 +17221,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17420,6 +17491,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17689,6 +17761,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17958,6 +18031,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18227,6 +18301,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18496,6 +18571,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18765,6 +18841,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -19062,6 +19139,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19363,6 +19441,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19664,6 +19743,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19965,6 +20045,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20266,6 +20347,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20567,6 +20649,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20868,6 +20951,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21169,6 +21253,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21470,6 +21555,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21771,6 +21857,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22072,6 +22159,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22373,6 +22461,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22674,6 +22763,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22975,6 +23065,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23276,6 +23367,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll index 53a8a0a1b694e..7561a60518bf8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll @@ -179,6 +179,7 @@ define amdgpu_kernel void @private_workgroup_unordered_load( ; ; GFX1250-LABEL: private_workgroup_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -378,6 +379,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_load( ; ; GFX1250-LABEL: private_workgroup_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -577,6 +579,7 @@ define amdgpu_kernel void @private_workgroup_acquire_load( ; ; GFX1250-LABEL: private_workgroup_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -776,6 +779,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; ; GFX1250-LABEL: private_workgroup_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -957,6 +961,7 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; ; GFX1250-LABEL: private_workgroup_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1135,6 +1140,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; ; GFX1250-LABEL: private_workgroup_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1313,6 +1319,7 @@ define amdgpu_kernel void @private_workgroup_release_store( ; ; GFX1250-LABEL: private_workgroup_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1493,6 +1500,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; ; GFX1250-LABEL: private_workgroup_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1673,6 +1681,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -1851,6 +1860,7 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2030,6 +2040,7 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2210,6 +2221,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2391,6 +2403,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -2626,6 +2639,7 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -2861,6 +2875,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3098,6 +3113,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -3365,6 +3381,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3634,6 +3651,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -3904,6 +3922,7 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4175,6 +4194,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4447,6 +4467,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4719,6 +4740,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -4989,6 +5011,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5259,6 +5282,7 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5531,6 +5555,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -5803,6 +5828,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6075,6 +6101,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6347,6 +6374,7 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6619,6 +6647,7 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -6891,6 +6920,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7163,6 +7193,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -7463,6 +7494,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -7764,6 +7796,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8065,6 +8098,7 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8368,6 +8402,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8671,6 +8706,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -8974,6 +9010,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9275,6 +9312,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9576,6 +9614,7 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -9879,6 +9918,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10182,6 +10222,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10485,6 +10526,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -10788,6 +10830,7 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11091,6 +11134,7 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11394,6 +11438,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11697,6 +11742,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -11908,6 +11954,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load( ; ; GFX1250-LABEL: private_workgroup_one_as_unordered_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12107,6 +12154,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12306,6 +12354,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12505,6 +12554,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12686,6 +12736,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; ; GFX1250-LABEL: private_workgroup_one_as_unordered_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -12864,6 +12915,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13042,6 +13094,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; ; GFX1250-LABEL: private_workgroup_one_as_release_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13222,6 +13275,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13402,6 +13456,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13580,6 +13635,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13759,6 +13815,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -13939,6 +13996,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14120,6 +14178,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s1, 0 @@ -14355,6 +14414,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14590,6 +14650,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -14827,6 +14888,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_mov_b32 s2, 0 @@ -15094,6 +15156,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15363,6 +15426,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15633,6 +15697,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -15904,6 +15969,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16176,6 +16242,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16448,6 +16515,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16718,6 +16786,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -16988,6 +17057,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17260,6 +17330,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17532,6 +17603,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -17804,6 +17876,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18076,6 +18149,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18348,6 +18422,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18620,6 +18695,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -18892,6 +18968,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 @@ -19192,6 +19269,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19493,6 +19571,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -19794,6 +19873,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20097,6 +20177,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20400,6 +20481,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -20703,6 +20785,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21004,6 +21087,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21305,6 +21389,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21608,6 +21693,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -21911,6 +21997,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22214,6 +22301,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22517,6 +22605,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -22820,6 +22909,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23123,6 +23213,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 @@ -23426,6 +23517,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 ; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 0458a64991028..eff0680fe9a31 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -129,6 +129,7 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_sle_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -226,6 +227,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_imin_sle_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -313,6 +315,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX1250-LABEL: s_test_imin_sle_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -439,6 +442,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; ; GFX1250-LABEL: s_test_imin_sle_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -574,6 +578,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX1250-LABEL: s_test_imin_sle_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x28 @@ -829,6 +834,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX1250-LABEL: s_test_imin_sle_v4i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c @@ -974,6 +980,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX1250-LABEL: s_test_imin_sle_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1170,6 +1177,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX1250-LABEL: s_test_imin_sle_v4i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 @@ -1305,6 +1313,7 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_slt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -1482,6 +1491,7 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-TRUE16-LABEL: v_test_imin_slt_i16: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_clause 0x1 ; GFX1250-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -1497,6 +1507,7 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-FAKE16-LABEL: v_test_imin_slt_i16: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_clause 0x1 ; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -1595,6 +1606,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_imin_slt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1699,6 +1711,7 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX1250-LABEL: s_test_imin_slt_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 @@ -1796,6 +1809,7 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX1250-LABEL: s_test_imin_slt_imm_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1890,6 +1904,7 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; ; GFX1250-LABEL: s_test_imin_sle_imm_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2024,6 +2039,7 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ule_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2184,6 +2200,7 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2387,6 +2404,7 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2490,6 +2508,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_umin_ule_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2624,6 +2643,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ult_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2784,6 +2804,7 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-TRUE16-LABEL: v_test_umin_ult_i8: ; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-TRUE16-NEXT: s_clause 0x1 ; GFX1250-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2799,6 +2820,7 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-FAKE16-LABEL: v_test_umin_ult_i8: ; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-FAKE16-NEXT: s_clause 0x1 ; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -2897,6 +2919,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX1250-LABEL: s_test_umin_ult_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3051,6 +3074,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i32_multi_use: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3247,6 +3271,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i16_multi_use: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3347,6 +3372,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX1250-LABEL: s_test_umin_ult_v1i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -3536,6 +3562,7 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX1250-LABEL: s_test_umin_ult_v8i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b512 s[8:23], s[4:5], 0x20 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -3852,6 +3879,7 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; ; GFX1250-LABEL: s_test_umin_ult_v8i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 @@ -3976,6 +4004,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; ; GFX1250-LABEL: simplify_demanded_bits_test_umin_ult_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c @@ -4105,6 +4134,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; ; GFX1250-LABEL: simplify_demanded_bits_test_min_slt_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x28 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x4c @@ -4237,6 +4267,7 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; ; GFX1250-LABEL: s_test_imin_sle_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -4365,6 +4396,7 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ult_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -4489,6 +4521,7 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ule_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -4613,6 +4646,7 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_slt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -4737,6 +4771,7 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_sle_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -4899,6 +4934,7 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_sle_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 @@ -5069,6 +5105,7 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_ule_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll index 60570bdbfc23b..dacee9a0173c5 100644 --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -41,25 +41,67 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_i32: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_max_i32 s0, s0, s1 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_min_i32 s0, s0, s2 -; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: global_store_b32 v0, v1, s[4:5] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_test_minmax_i32: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_max_i32 s0, s0, s1 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_min_i32 s0, s0, s2 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_i32: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_max_i32 s0, s0, s1 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_min_i32 s0, s0, s2 +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm +; +; GISEL-GFX11-LABEL: s_test_minmax_i32: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_max_i32 s0, s0, s1 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_min_i32 s0, s0, s2 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: s_test_minmax_i32: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_max_i32 s0, s0, s1 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_min_i32 s0, s0, s2 +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_i32: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_i32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_min_i32 s0, s0, s2 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm +; +; SDAG-GFX1250-LABEL: s_test_minmax_i32: +; SDAG-GFX1250: ; %bb.0: +; SDAG-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; SDAG-GFX1250-NEXT: s_max_i32 s0, s0, s1 +; SDAG-GFX1250-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX1250-NEXT: s_min_i32 s0, s0, s2 +; SDAG-GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX1250-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1250-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX1250-NEXT: s_endpgm +; +; GISEL-GFX1250-LABEL: s_test_minmax_i32: +; GISEL-GFX1250: ; %bb.0: +; GISEL-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GISEL-GFX1250-NEXT: s_max_i32 s0, s0, s1 +; GISEL-GFX1250-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX1250-NEXT: s_min_i32 s0, s0, s2 +; GISEL-GFX1250-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX1250-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX1250-NEXT: s_endpgm %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c) store i32 %sminmax, ptr addrspace(1) %out @@ -213,25 +255,67 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) { } define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_u32: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_max_u32 s0, s0, s1 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_min_u32 s0, s0, s2 -; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: global_store_b32 v0, v1, s[4:5] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: s_test_minmax_u32: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_max_u32 s0, s0, s1 -; GISEL-NEXT: s_mov_b32 s6, s3 -; GISEL-NEXT: s_min_u32 s0, s0, s2 -; GISEL-NEXT: s_mov_b32 s7, s4 -; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 -; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] -; GISEL-NEXT: s_endpgm +; SDAG-GFX11-LABEL: s_test_minmax_u32: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_max_u32 s0, s0, s1 +; SDAG-GFX11-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX11-NEXT: s_min_u32 s0, s0, s2 +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX11-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX11-NEXT: s_endpgm +; +; GISEL-GFX11-LABEL: s_test_minmax_u32: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_max_u32 s0, s0, s1 +; GISEL-GFX11-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX11-NEXT: s_min_u32 s0, s0, s2 +; GISEL-GFX11-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX11-NEXT: s_endpgm +; +; SDAG-GFX12-LABEL: s_test_minmax_u32: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_max_u32 s0, s0, s1 +; SDAG-GFX12-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX12-NEXT: s_min_u32 s0, s0, s2 +; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX12-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX12-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: s_test_minmax_u32: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_max_u32 s0, s0, s1 +; GISEL-GFX12-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX12-NEXT: s_min_u32 s0, s0, s2 +; GISEL-GFX12-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX12-NEXT: s_endpgm +; +; SDAG-GFX1250-LABEL: s_test_minmax_u32: +; SDAG-GFX1250: ; %bb.0: +; SDAG-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; SDAG-GFX1250-NEXT: s_max_u32 s0, s0, s1 +; SDAG-GFX1250-NEXT: s_mov_b32 s5, s4 +; SDAG-GFX1250-NEXT: s_min_u32 s0, s0, s2 +; SDAG-GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX1250-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1250-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-GFX1250-NEXT: s_endpgm +; +; GISEL-GFX1250-LABEL: s_test_minmax_u32: +; GISEL-GFX1250: ; %bb.0: +; GISEL-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GISEL-GFX1250-NEXT: s_max_u32 s0, s0, s1 +; GISEL-GFX1250-NEXT: s_mov_b32 s6, s3 +; GISEL-GFX1250-NEXT: s_min_u32 s0, s0, s2 +; GISEL-GFX1250-NEXT: s_mov_b32 s7, s4 +; GISEL-GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 +; GISEL-GFX1250-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-GFX1250-NEXT: s_endpgm %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c) store i32 %sminmax, ptr addrspace(1) %out @@ -460,6 +544,7 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg ; ; SDAG-GFX1250-LABEL: s_test_minmax_f32_ieee_false: ; SDAG-GFX1250: ; %bb.0: +; SDAG-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; SDAG-GFX1250-NEXT: s_mov_b32 s5, s4 ; SDAG-GFX1250-NEXT: s_mov_b32 s4, s3 @@ -469,6 +554,7 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg ; ; GISEL-GFX1250-LABEL: s_test_minmax_f32_ieee_false: ; GISEL-GFX1250: ; %bb.0: +; GISEL-GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-NEXT: s_max_num_f32 s0, s0, s1 ; GISEL-GFX1250-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX1250-NEXT: s_mov_b32 s7, s4 @@ -496,6 +582,7 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b, ; ; GFX1250-LABEL: test_minmax_commuted_f32_ieee_false: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_maxmin_num_f32 v0, v0, v1, v2 ; GFX1250-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) @@ -579,6 +666,7 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b, ; ; GFX1250-LABEL: test_maxmin_commuted_f32_ieee_false: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_minmax_num_f32 v0, v0, v1, v2 ; GFX1250-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) @@ -697,21 +785,25 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) { ; ; SDAG-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false: ; SDAG-GFX1250-TRUE16: ; %bb.0: +; SDAG-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; SDAG-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false: ; SDAG-GFX1250-FAKE16: ; %bb.0: +; SDAG-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; SDAG-GFX1250-FAKE16-NEXT: ; return to shader part epilog ; ; GISEL-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false: ; GISEL-GFX1250-TRUE16: ; %bb.0: +; GISEL-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GISEL-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false: ; GISEL-GFX1250-FAKE16: ; %bb.0: +; GISEL-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2 ; GISEL-GFX1250-FAKE16-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) @@ -801,6 +893,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b ; ; SDAG-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false: ; SDAG-GFX1250-TRUE16: ; %bb.0: +; SDAG-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; SDAG-GFX1250-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX1250-TRUE16-NEXT: s_mov_b32 s5, s4 @@ -811,6 +904,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b ; ; SDAG-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false: ; SDAG-GFX1250-FAKE16: ; %bb.0: +; SDAG-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; SDAG-GFX1250-FAKE16-NEXT: s_mov_b32 s5, s4 ; SDAG-GFX1250-FAKE16-NEXT: s_mov_b32 s4, s3 @@ -820,6 +914,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b ; ; GISEL-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false: ; GISEL-GFX1250-TRUE16: ; %bb.0: +; GISEL-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-TRUE16-NEXT: s_max_num_f16 s0, s0, s1 ; GISEL-GFX1250-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX1250-TRUE16-NEXT: s_mov_b32 s6, s3 @@ -831,6 +926,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b ; ; GISEL-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false: ; GISEL-GFX1250-FAKE16: ; %bb.0: +; GISEL-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-FAKE16-NEXT: s_max_num_f16 s0, s0, s1 ; GISEL-GFX1250-FAKE16-NEXT: s_mov_b32 s6, s3 ; GISEL-GFX1250-FAKE16-NEXT: s_mov_b32 s7, s4 @@ -1021,21 +1117,25 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) { ; ; SDAG-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false: ; SDAG-GFX1250-TRUE16: ; %bb.0: +; SDAG-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; SDAG-GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; SDAG-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false: ; SDAG-GFX1250-FAKE16: ; %bb.0: +; SDAG-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; SDAG-GFX1250-FAKE16-NEXT: ; return to shader part epilog ; ; GISEL-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false: ; GISEL-GFX1250-TRUE16: ; %bb.0: +; GISEL-GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l ; GISEL-GFX1250-TRUE16-NEXT: ; return to shader part epilog ; ; GISEL-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false: ; GISEL-GFX1250-FAKE16: ; %bb.0: +; GISEL-GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2 ; GISEL-GFX1250-FAKE16-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) @@ -1297,3 +1397,6 @@ declare half @llvm.maxnum.f16(half, half) declare float @llvm.minnum.f32(float, float) declare float @llvm.maxnum.f32(float, float) attributes #0 = { nounwind "no-nans-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 4681d589ac217..f1130100725c8 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -127,6 +127,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX1250-LABEL: test_mul_v2i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -308,6 +309,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; ; GFX1250-LABEL: v_mul_v4i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -447,6 +449,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; ; GFX1250-LABEL: s_trunc_i64_mul_to_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34 @@ -614,6 +617,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; ; GFX1250-LABEL: v_trunc_i64_mul_to_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 @@ -752,6 +756,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX1250-LABEL: mul64_sext_c: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_ashr_i32 s3, s2, 31 @@ -868,6 +873,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX1250-LABEL: mul64_zext_c: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s3, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1016,6 +1022,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX1250-LABEL: v_mul64_sext_c: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -1179,6 +1186,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; ; GFX1250-LABEL: v_mul64_zext_c: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -1338,6 +1346,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; ; GFX1250-LABEL: v_mul64_sext_inline_imm: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -1466,6 +1475,7 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; ; GFX1250-LABEL: s_mul_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 @@ -1605,6 +1615,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; GFX1250-LABEL: v_mul_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -1738,6 +1749,7 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; ; GFX1250-LABEL: s_mul_i1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4c ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x70 @@ -1913,6 +1925,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; GFX1250-LABEL: v_mul_i1: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_mov_b32 s6, -1 ; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 @@ -2092,6 +2105,7 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX1250-LABEL: s_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -2293,6 +2307,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; ; GFX1250-LABEL: v_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 @@ -2558,6 +2573,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-LABEL: mul32_in_branch: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX1250-NEXT: s_mov_b32 s6, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2846,6 +2862,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-LABEL: mul64_in_branch: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_cmp_lg_u64 s[12:13], 0 @@ -3216,6 +3233,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX1250-LABEL: s_mul_i128: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x2 ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c ; GFX1250-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c @@ -3537,6 +3555,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; ; GFX1250-LABEL: v_mul_i128: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX1250-NEXT: v_and_b32_e32 v16, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 6b45d31da0e95..1ed024f7aed36 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { ; ; GFX1250-LABEL: fadd_v2_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -77,6 +78,7 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; ; GFX1250-LABEL: fadd_v2_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -141,6 +143,7 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v4_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -158,6 +161,7 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-GISEL-LABEL: fadd_v4_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -343,6 +347,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 @@ -409,6 +414,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -527,6 +533,7 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_imm: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 @@ -539,6 +546,7 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 @@ -599,6 +607,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -610,6 +619,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -672,6 +682,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -683,6 +694,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0 @@ -731,6 +743,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x3f800000 @@ -743,6 +756,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000 @@ -791,6 +805,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x3f80000000000000 @@ -803,6 +818,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f80000000000000 @@ -851,6 +867,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x400000003f800000 @@ -863,6 +880,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x400000003f800000 @@ -927,6 +945,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -938,6 +957,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1004,6 +1024,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1015,6 +1036,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1081,6 +1103,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ; ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1092,6 +1115,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1155,6 +1179,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ; ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1168,6 +1193,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1231,6 +1257,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ; ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1244,6 +1271,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo ; ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1293,6 +1321,7 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { ; ; GFX1250-LABEL: fmul_v2_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1336,6 +1365,7 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; ; GFX1250-LABEL: fmul_v2_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -1400,6 +1430,7 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v4_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -1417,6 +1448,7 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-GISEL-LABEL: fmul_v4_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -1602,6 +1634,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 @@ -1668,6 +1701,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -1785,6 +1819,7 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fmul_v2_v_imm: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000 @@ -1797,6 +1832,7 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fmul_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 @@ -1857,6 +1893,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fmul_v2_v_v_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1868,6 +1905,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fmul_v2_v_v_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1930,6 +1968,7 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fmul_v2_v_lit_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -1941,6 +1980,7 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 @@ -1990,6 +2030,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4040000040800000 @@ -2002,6 +2043,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x4040000040800000 @@ -2065,6 +2107,7 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-SDAG-LABEL: fmul_v2_v_fneg: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2076,6 +2119,7 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-GISEL-LABEL: fmul_v2_v_fneg: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2125,6 +2169,7 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { ; ; GFX1250-LABEL: fma_v2_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2168,6 +2213,7 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { ; ; GFX1250-LABEL: fma_v2_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -2232,6 +2278,7 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-SDAG-LABEL: fma_v4_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -2249,6 +2296,7 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { ; ; GFX1250-GISEL-LABEL: fma_v4_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 @@ -2434,6 +2482,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fma_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -2499,6 +2548,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-GISEL-LABEL: fma_v32_vs: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_clause 0x1 @@ -2639,6 +2689,7 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fma_v2_v_imm: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x43480000 @@ -2653,6 +2704,7 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fma_v2_v_imm: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000 @@ -2716,6 +2768,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fma_v2_v_v_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2727,6 +2780,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fma_v2_v_v_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2809,6 +2863,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fma_v2_v_lit_splat: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2820,6 +2875,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0 @@ -2909,6 +2965,7 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4040000040800000 @@ -2922,6 +2979,7 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x4040000040800000 @@ -2987,6 +3045,7 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-SDAG-LABEL: fma_v2_v_fneg: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2998,6 +3057,7 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; ; GFX1250-GISEL-LABEL: fma_v2_v_fneg: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -3068,6 +3128,7 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou ; ; GFX1250-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 @@ -3081,6 +3142,7 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou ; ; GFX1250-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 @@ -3161,6 +3223,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; ; GFX1250-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v2, s2 @@ -3176,6 +3239,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; ; GFX1250-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s3 @@ -3255,6 +3319,7 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-SDAG-LABEL: shuffle_add_f32: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2 @@ -3267,6 +3332,7 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-GISEL-LABEL: shuffle_add_f32: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -3343,6 +3409,7 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-SDAG-LABEL: shuffle_neg_add_f32: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2 @@ -3358,6 +3425,7 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-GISEL-LABEL: shuffle_neg_add_f32: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 @@ -3433,6 +3501,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; ; GFX1250-SDAG-LABEL: fadd_fadd_fsub_0: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0 @@ -3444,6 +3513,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] @@ -3533,6 +3603,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; ; GFX1250-SDAG-LABEL: fadd_fadd_fsub: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -3550,6 +3621,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 @@ -3621,6 +3693,7 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { ; ; GFX1250-SDAG-LABEL: fadd_shuffle_v4: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -3633,6 +3706,7 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { ; ; GFX1250-GISEL-LABEL: fadd_shuffle_v4: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -3693,6 +3767,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ; ; GFX1250-SDAG-LABEL: fneg_v2f32_vec: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -3704,6 +3779,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ; ; GFX1250-GISEL-LABEL: fneg_v2f32_vec: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -3756,6 +3832,7 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; ; GFX1250-SDAG-LABEL: fneg_v2f32_scalar: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000 @@ -3767,6 +3844,7 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; ; GFX1250-GISEL-LABEL: fneg_v2f32_scalar: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 6671201ca2b94..6f222aa7d6977 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -34,6 +34,7 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 ; ; GFX1250-LABEL: preload_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -75,6 +76,7 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr ; ; GFX1250-LABEL: preload_unused_arg_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -117,6 +119,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: no_free_sgprs_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s18 ; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9] ; GFX1250-NEXT: s_endpgm @@ -149,6 +152,7 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 { ; ; GFX1250-LABEL: no_inreg_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 @@ -186,6 +190,7 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 ; ; GFX1250-LABEL: mixed_inreg_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b32 s4, s[0:1], 0x10 ; GFX1250-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 @@ -232,6 +237,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; ; GFX1250-LABEL: incorrect_type_i64_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -277,6 +283,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; ; GFX1250-LABEL: incorrect_type_i16_block_count_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] offset:8 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -318,6 +325,7 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 ; ; GFX1250-LABEL: preload_block_count_y: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -361,6 +369,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; ; GFX1250-LABEL: random_incorrect_offset: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0xa ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -405,6 +414,7 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 ; ; GFX1250-LABEL: preload_block_count_z: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -451,6 +461,7 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; ; GFX1250-LABEL: preload_block_count_x_imparg_align_ptr_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 0xff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_add_co_i32 s0, s6, s0 @@ -501,6 +512,7 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; ; GFX1250-LABEL: preload_block_count_xyz: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4 ; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3] @@ -552,6 +564,7 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; ; GFX1250-LABEL: preload_workgroup_size_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s7, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -598,6 +611,7 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; ; GFX1250-LABEL: preload_workgroup_size_y: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s7, 16 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -646,6 +660,7 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; ; GFX1250-LABEL: preload_workgroup_size_z: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s8, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -702,6 +717,7 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; ; GFX1250-LABEL: preload_workgroup_size_xyz: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s7, 16 ; GFX1250-NEXT: s_and_b32 s1, s7, 0xffff ; GFX1250-NEXT: s_and_b32 s4, s8, 0xffff @@ -761,6 +777,7 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; ; GFX1250-LABEL: preload_remainder_x: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s8, 16 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -807,6 +824,7 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; ; GFX1250-LABEL: preloadremainder_y: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s9, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -853,6 +871,7 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; ; GFX1250-LABEL: preloadremainder_z: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s9, 16 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -907,6 +926,7 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; ; GFX1250-LABEL: preloadremainder_xyz: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s9, 16 ; GFX1250-NEXT: s_lshr_b32 s1, s8, 16 ; GFX1250-NEXT: s_and_b32 s4, s9, 0xffff @@ -964,6 +984,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; ; GFX1250-LABEL: no_free_sgprs_preloadremainder_z: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s15, 16 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -1012,6 +1033,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % ; ; GFX1250-LABEL: preload_block_max_user_sgprs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s12 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -1060,6 +1082,7 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; ; GFX1250-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s9, 16 ; GFX1250-NEXT: s_and_b32 s1, s8, 0xffff ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index c1764c94ea2de..c3e283e8a01bf 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -36,6 +36,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) ; ; GFX1250-LABEL: ptr1_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 0xff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -79,6 +80,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; ; GFX1250-LABEL: ptr1_i8_zext_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 0xff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -122,6 +124,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 ; ; GFX1250-LABEL: ptr1_i16_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -163,6 +166,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 ; ; GFX1250-LABEL: ptr1_i32_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -205,6 +209,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa ; ; GFX1250-LABEL: i32_ptr1_i32_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s0, s2, s6 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -252,6 +257,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX1250-LABEL: ptr1_i16_i16_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_lshr_b32 s0, s4, 16 ; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -297,6 +303,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 ; ; GFX1250-LABEL: ptr1_v2i8_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -346,6 +353,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; ; GFX1250-LABEL: byref_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 @@ -404,6 +412,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: byref_staggered_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 @@ -471,6 +480,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x ; ; GFX1250-LABEL: v8i32_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s14 ; GFX1250-NEXT: v_dual_mov_b32 v1, s15 :: v_dual_mov_b32 v2, s16 ; GFX1250-NEXT: v_dual_mov_b32 v3, s17 :: v_dual_mov_b32 v4, s10 @@ -518,6 +528,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX1250-LABEL: v3i16_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 ; GFX1250-NEXT: v_mov_b32_e32 v2, s4 ; GFX1250-NEXT: s_clause 0x1 @@ -563,6 +574,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX1250-LABEL: v3i32_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0 ; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3] @@ -606,6 +618,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX1250-LABEL: v3f32_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6 ; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 ; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3] @@ -662,6 +675,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou ; ; GFX1250-LABEL: v5i8_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4 ; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 @@ -733,6 +747,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; ; GFX1250-LABEL: v5f64_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b64 s[12:13], s[0:1], 0x60 ; GFX1250-NEXT: s_load_b256 s[4:11], s[0:1], 0x40 @@ -812,6 +827,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; ; GFX1250-LABEL: v8i8_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s5 ; GFX1250-NEXT: s_pack_lh_b32_b16 s1, 0, s4 ; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff @@ -857,6 +873,7 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i ; ; GFX1250-LABEL: i64_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -895,6 +912,7 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d ; ; GFX1250-LABEL: f64_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -934,6 +952,7 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX1250-LABEL: half_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -972,6 +991,7 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX1250-LABEL: bfloat_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -1010,6 +1030,7 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: v2bfloat_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm @@ -1051,6 +1072,7 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: v3bfloat_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 ; GFX1250-NEXT: v_mov_b32_e32 v2, s4 ; GFX1250-NEXT: s_clause 0x1 @@ -1096,6 +1118,7 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: v6bfloat_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0 ; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3] @@ -1148,6 +1171,7 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; ; GFX1250-LABEL: half_v7bfloat_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v2, s8 ; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1194,6 +1218,7 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 ; ; GFX1250-LABEL: i1_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 @@ -1240,6 +1265,7 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX1250-LABEL: fp128_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s6 ; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 ; GFX1250-NEXT: v_mov_b32_e32 v3, s9 @@ -1299,6 +1325,7 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX1250-LABEL: v7i8_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4 ; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5 @@ -1353,6 +1380,7 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX1250-LABEL: v7half_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s9 ; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v0, s6 ; GFX1250-NEXT: v_mov_b32_e32 v1, s7 @@ -1399,6 +1427,7 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX1250-LABEL: i16_i32_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: s_clause 0x1 @@ -1450,6 +1479,7 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg % ; ; GFX1250-LABEL: i16_v3i32_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4 ; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1250-NEXT: v_mov_b32_e32 v2, s8 @@ -1495,6 +1525,7 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX1250-LABEL: i16_i16_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] @@ -1548,6 +1579,7 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: i16_v2i8_kernel_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] @@ -1597,6 +1629,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; ; GFX1250-LABEL: i32_ptr1_i32_staggered_preload_arg: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[4:6], s[0:1], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_add_co_i32 s0, s2, s6 @@ -1641,6 +1674,7 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, ; ; GFX1250-LABEL: ptr1_i8_trailing_unused: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_and_b32 s0, s4, 0xff ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index b2bcb74e4184f..40676cef1bc5e 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -4083,6 +4083,7 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; ; GFX1250-LABEL: compute_mad: ; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 @@ -4189,6 +4190,7 @@ define amdgpu_ps i32 @s_mul_add_1_i32(i32 inreg %x, i32 inreg %y) { ; ; GFX1250-LABEL: s_mul_add_1_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s1, s1, 1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 @@ -4225,6 +4227,7 @@ define amdgpu_ps i32 @s_mul_add_1_i32_commute(i32 inreg %x, i32 inreg %y) { ; ; GFX1250-LABEL: s_mul_add_1_i32_commute: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_add_co_i32 s1, s1, 1 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_i32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll index dc2e09dda2193..9309fd0f5c15f 100644 --- a/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll @@ -5,6 +5,7 @@ define amdgpu_kernel void @kernel1() #0 { ; GFX12-LABEL: kernel1: ; GFX12: ; %bb.0: +; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX12-NEXT: s_cmp_eq_u32 0, 0 ; GFX12-NEXT: s_barrier_signal_isfirst -1 ; GFX12-NEXT: s_barrier_wait -1 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index a18847b56a330..67411721f1b41 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -5,6 +5,7 @@ define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -18,6 +19,7 @@ entry: define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b32_idx32: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -30,6 +32,7 @@ entry: define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b32_idxprom_wrong_stride: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] @@ -46,6 +49,7 @@ entry: define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b16_idxprom_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -62,6 +66,7 @@ entry: define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -75,6 +80,7 @@ entry: define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -88,6 +94,7 @@ entry: define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b96_idxpromi_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -102,6 +109,7 @@ entry: define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -115,6 +123,7 @@ entry: define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b32_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset @@ -131,6 +140,7 @@ entry: define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b32_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset @@ -150,6 +160,7 @@ entry: define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b8_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16 @@ -169,6 +180,7 @@ entry: define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b16_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset @@ -187,6 +199,7 @@ entry: define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b16_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset @@ -206,6 +219,7 @@ entry: define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset @@ -222,6 +236,7 @@ entry: define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b96_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset @@ -238,6 +253,7 @@ entry: define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b96_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset @@ -255,6 +271,7 @@ entry: define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { ; GCN-LABEL: flat_load_b128_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: flat_load_b32 v0, v[0:1] ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset @@ -271,6 +288,7 @@ entry: define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -284,6 +302,7 @@ entry: define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b16_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -297,6 +316,7 @@ entry: define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 ; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -310,6 +330,7 @@ entry: define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_atomicrmw_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_endpgm @@ -323,6 +344,7 @@ entry: define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) { ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] @@ -364,6 +386,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll index a0fca0e2bdc72..d2db2bb19f15b 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -5,6 +5,7 @@ define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -18,6 +19,7 @@ entry: define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b32_idx32: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -30,6 +32,7 @@ entry: define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b32_idxprom_wrong_stride: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] @@ -46,6 +49,7 @@ entry: define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b16_idxprom_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -62,6 +66,7 @@ entry: define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -75,6 +80,7 @@ entry: define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -88,6 +94,7 @@ entry: define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b96_idxpromi_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -102,6 +109,7 @@ entry: define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -115,6 +123,7 @@ entry: define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b32_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset @@ -131,6 +140,7 @@ entry: define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b32_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset @@ -150,6 +160,7 @@ entry: define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b8_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16 @@ -169,6 +180,7 @@ entry: define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b16_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset @@ -187,6 +199,7 @@ entry: define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b16_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset @@ -206,6 +219,7 @@ entry: define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset @@ -222,6 +236,7 @@ entry: define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b96_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset @@ -238,6 +253,7 @@ entry: define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b96_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset @@ -255,6 +271,7 @@ entry: define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { ; GCN-LABEL: global_load_b128_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: global_load_b32 v0, v[0:1], off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset @@ -271,6 +288,7 @@ entry: define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_store_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -284,6 +302,7 @@ entry: define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) { ; GCN-LABEL: global_store_b16_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -297,6 +316,7 @@ entry: define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_store_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 ; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset ; GCN-NEXT: s_endpgm @@ -310,6 +330,7 @@ entry: define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_atomicrmw_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_endpgm @@ -323,6 +344,7 @@ entry: define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) { ; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1 ; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll index ba4fedf5bb009..438601f8cae35 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -5,6 +5,7 @@ define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { ; GCN-LABEL: scratch_load_b32_alloca_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -19,6 +20,7 @@ entry: define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -32,6 +34,7 @@ entry: define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b32_idx32: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -44,6 +47,7 @@ entry: define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: scratch_load_b32 v0, v0, s0 ; GCN-NEXT: s_wait_loadcnt 0x0 @@ -58,6 +62,7 @@ entry: define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b16_idxprom_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -74,6 +79,7 @@ entry: define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -87,6 +93,7 @@ entry: define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -100,6 +107,7 @@ entry: define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b96_idxpromi_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -114,6 +122,7 @@ entry: define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog @@ -127,6 +136,7 @@ entry: define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b32_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset @@ -143,6 +153,7 @@ entry: define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset @@ -160,6 +171,7 @@ entry: define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16 @@ -179,6 +191,7 @@ entry: define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b16_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset @@ -197,6 +210,7 @@ entry: define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset @@ -216,6 +230,7 @@ entry: define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset @@ -234,6 +249,7 @@ entry: define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b96_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset @@ -250,6 +266,7 @@ entry: define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset @@ -267,6 +284,7 @@ entry: define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { ; GCN-LABEL: scratch_load_b128_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: scratch_load_b32 v0, v0, off ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset @@ -283,6 +301,7 @@ entry: define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_store_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset ; GCN-NEXT: s_endpgm @@ -296,6 +315,7 @@ entry: define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_store_b16_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset ; GCN-NEXT: s_endpgm @@ -309,6 +329,7 @@ entry: define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: scratch_store_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 ; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll index e0ea08d276979..7b9faf47793ce 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -5,6 +5,7 @@ define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b32_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -21,6 +22,7 @@ entry: define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; SDAG-LABEL: s_load_b32_idx32: ; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-NEXT: s_ashr_i32 s3, s2, 31 ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 @@ -32,6 +34,7 @@ define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 ; ; GISEL-LABEL: s_load_b32_idx32: ; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_ashr_i32 s3, s2, 31 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 @@ -50,6 +53,7 @@ entry: define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; SDAG-LABEL: s_load_b32_idxprom_wrong_stride: ; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; SDAG-NEXT: s_mov_b32 s3, 0 ; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 @@ -61,6 +65,7 @@ define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 ; ; GISEL-LABEL: s_load_b32_idxprom_wrong_stride: ; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s3, 0 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 @@ -80,6 +85,7 @@ entry: define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b16_idxprom_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -97,6 +103,7 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b64 s[4:5], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -111,6 +118,7 @@ entry: define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -126,6 +134,7 @@ entry: define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -141,6 +150,7 @@ entry: define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b256_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -158,6 +168,7 @@ entry: define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b512_idxprom: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 @@ -179,6 +190,7 @@ entry: define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b32_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset @@ -196,6 +208,7 @@ entry: define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b32_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset @@ -216,6 +229,7 @@ entry: define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b8_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10 @@ -236,6 +250,7 @@ entry: define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b16_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset @@ -255,6 +270,7 @@ entry: define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b16_idxprom_range_ioffset: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset @@ -275,6 +291,7 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b64 s[2:3], s[0:1], s4 offset:0x0 scale_offset @@ -292,6 +309,7 @@ entry: define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b96_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset @@ -310,6 +328,7 @@ entry: define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b128_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset @@ -328,6 +347,7 @@ entry: define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b256_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset @@ -348,6 +368,7 @@ entry: define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b512_idxprom_range: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll index f79fbd98f1e09..80dd796afb13c 100644 --- a/llvm/test/CodeGen/AMDGPU/sub_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll @@ -12,6 +12,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) { ; ; GFX1250-LABEL: test_sub_u64_vv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 %a, %b @@ -29,6 +30,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) { ; ; GFX1250-LABEL: test_sub_u64_vs: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 %a, %b @@ -46,6 +48,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) { ; ; GFX1250-LABEL: test_sub_u64_sv: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 %a, %b @@ -54,12 +57,20 @@ define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) { } define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) { -; GCN-LABEL: test_sub_u64_ss: -; GCN: ; %bb.0: -; GCN-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX12-LABEL: test_sub_u64_ss: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_ss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 %a, %b %ret = bitcast i64 %sub to <2 x float> ret <2 x float> %ret @@ -75,6 +86,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) { ; ; GFX1250-LABEL: test_sub_u64_inline_lit_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 5, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 5, %a @@ -92,6 +104,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) { ; ; GFX1250-LABEL: test_sub_u64_v_inline_lit: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], -5, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 %a, 5 @@ -109,6 +122,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) { ; ; GFX1250-LABEL: test_sub_u64_small_imm_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 500, %a @@ -126,6 +140,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) { ; ; GFX1250-LABEL: test_sub_u64_64bit_imm_v: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 5294967295, %a @@ -134,13 +149,23 @@ define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) { } define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) { -; GCN-LABEL: test_sub_u64_small_imm_s: -; GCN: ; %bb.0: -; GCN-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX12-LABEL: test_sub_u64_small_imm_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: test_sub_u64_small_imm_s: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-NEXT: s_sub_nc_u64 s[0:1], 0x1f4, s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 500, %a %ret = bitcast i64 %sub to <2 x float> ret <2 x float> %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll index 3d74b171400ac..44f82d4b36fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll +++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll @@ -17,6 +17,7 @@ define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_i8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 @@ -58,6 +59,7 @@ define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_u8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c ; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll index 985bcbd6ff4f4..8c32a1a76a4db 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll @@ -12,12 +12,14 @@ define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %r ; ; GFX1250-SDAG-LABEL: intrinsic_store_system_scope: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: buffer_store_b32 v0, v[2:3], s[0:3], s4 idxen offen scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: intrinsic_store_system_scope: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: buffer_store_b32 v0, v[4:5], s[0:3], s4 idxen offen scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_endpgm @@ -34,6 +36,7 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) { ; ; GFX1250-SDAG-LABEL: generic_store_volatile: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: global_store_b32 v[2:3], v0, off scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 @@ -41,6 +44,7 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) { ; ; GFX1250-GISEL-LABEL: generic_store_volatile: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: global_store_b32 v[4:5], v0, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index 75817105e74fd..49031e1d818f0 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -18,7 +18,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -27,7 +27,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -42,7 +42,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -51,7 +51,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -66,7 +66,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, -1 @@ -76,7 +76,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, vcc @@ -91,7 +91,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, -1 @@ -101,7 +101,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, vcc @@ -113,7 +113,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -122,7 +122,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -145,7 +145,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -154,7 +154,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -169,7 +169,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -178,7 +178,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -193,7 +193,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, -1 @@ -202,7 +202,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, vcc @@ -217,7 +217,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, -1 @@ -226,7 +226,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, vcc @@ -238,7 +238,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -247,7 +247,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 % ; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -362,7 +362,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 @@ -383,7 +383,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 @@ -401,7 +401,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v2, s32 ; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 @@ -422,7 +422,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v2, off, s32 ; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 @@ -440,7 +440,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 @@ -462,7 +462,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 @@ -480,7 +480,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v2, s32 ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 @@ -502,7 +502,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v2, off, s32 ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 @@ -517,7 +517,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3 +; GFX1250-DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 @@ -540,7 +540,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3 +; GFX1250-DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 @@ -906,7 +906,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -921,7 +921,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -936,7 +936,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, -1 @@ -951,7 +951,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo @@ -966,7 +966,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, -1 @@ -981,7 +981,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: s_mov_b64 exec, vcc @@ -996,7 +996,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, -1 @@ -1011,7 +1011,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: s_mov_b64 exec, vcc @@ -1023,7 +1023,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -1038,7 +1038,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x1 +; GFX1250-DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 @@ -1067,7 +1067,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1080,7 +1080,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1097,7 +1097,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1110,7 +1110,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1127,7 +1127,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1142,7 +1142,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1159,7 +1159,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 -; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1174,7 +1174,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 -; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1188,7 +1188,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3 +; GFX1250-DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1201,7 +1201,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { ; GFX1250-DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX1250-DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3 +; GFX1250-DAGISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1225,7 +1225,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 -; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1242,7 +1242,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; DAGISEL-NEXT: scratch_store_b32 off, v5, s11 ; DAGISEL-NEXT: s_wait_alu 0xfffe ; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 -; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1261,7 +1261,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 s34, -1 -; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1283,7 +1283,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 ; GISEL-NEXT: scratch_store_b32 off, v5, s11 ; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1 -; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1302,7 +1302,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: s_clause 0x5 ; 24-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1322,7 +1322,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11 ; DAGISEL64-NEXT: s_wait_alu 0xfffe ; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 -; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: s_clause 0x5 ; 24-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1341,7 +1341,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: s_clause 0x5 ; 24-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1365,7 +1365,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 ; GISEL64-NEXT: scratch_store_b32 off, v5, s11 ; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1 -; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: s_clause 0x5 ; 24-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1381,7 +1381,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x5 +; GFX1250-DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -1399,7 +1399,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v5, s11 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x5 +; GFX1250-DAGISEL-NEXT: s_clause 0x5 ; 24-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -1429,7 +1429,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: s_mov_b32 s0, s33 ; DAGISEL-NEXT: s_mov_b32 s33, s32 ; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -1462,7 +1462,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -1495,7 +1495,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 ; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 ; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 ; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 ; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 @@ -1528,7 +1528,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 ; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 ; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 ; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 ; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 @@ -1561,7 +1561,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 ; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 ; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 ; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 ; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 @@ -1600,7 +1600,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; DAGISEL-NEXT: s_mov_b32 s32, s33 ; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -1633,7 +1633,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -1666,7 +1666,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 ; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 ; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 ; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 ; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 @@ -1699,7 +1699,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 ; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 ; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 ; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 ; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 @@ -1732,7 +1732,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 ; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 ; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 ; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 ; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 @@ -1765,7 +1765,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: s_mov_b32 s0, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -1798,7 +1798,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -1831,7 +1831,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 ; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 ; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 ; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 ; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 @@ -1864,7 +1864,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 ; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 ; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 ; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 ; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 @@ -1897,7 +1897,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 ; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 ; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 ; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 ; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 @@ -1936,7 +1936,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -1969,7 +1969,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -2002,7 +2002,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 ; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 ; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 ; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 ; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 @@ -2035,7 +2035,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 ; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 ; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 ; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 ; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 @@ -2068,7 +2068,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 ; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 ; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 ; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 ; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 @@ -2101,7 +2101,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: s_mov_b32 s0, s33 ; DAGISEL64-NEXT: s_mov_b32 s33, s32 ; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -2134,7 +2134,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -2167,7 +2167,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 ; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 ; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 ; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 ; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 @@ -2200,7 +2200,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 ; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 ; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 ; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 ; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 @@ -2233,7 +2233,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 ; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 ; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 ; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 ; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 @@ -2274,7 +2274,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; DAGISEL64-NEXT: s_mov_b32 s32, s33 ; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -2307,7 +2307,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -2340,7 +2340,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 ; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 ; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 ; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 ; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 @@ -2373,7 +2373,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 ; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 ; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 ; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 ; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 @@ -2406,7 +2406,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 ; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 ; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 ; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 ; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 @@ -2439,7 +2439,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: s_mov_b32 s0, s33 ; GISEL64-NEXT: s_mov_b32 s33, s32 ; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -2472,7 +2472,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -2505,7 +2505,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 ; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 ; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 ; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 ; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 @@ -2538,7 +2538,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 ; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 ; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 ; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 ; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 @@ -2571,7 +2571,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 ; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 ; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 ; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 ; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 @@ -2612,7 +2612,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GISEL64-NEXT: s_mov_b32 s32, s33 ; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -2645,7 +2645,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -2678,7 +2678,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 ; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 ; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 ; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 ; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 @@ -2711,7 +2711,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 ; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 ; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 ; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 ; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 @@ -2744,7 +2744,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 ; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 ; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 ; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 ; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 @@ -2774,7 +2774,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s0, s33 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s32 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -2838,7 +2838,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 @@ -2902,7 +2902,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 @@ -2967,7 +2967,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v298*/, s33 offset:748 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v299*/, s33 offset:752 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v300*/, s33 offset:756 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v45 /*v301*/, s33 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v302*/, s33 offset:764 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v303*/, s33 offset:768 @@ -3031,7 +3031,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v361*/, s33 offset:1000 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v362*/, s33 offset:1004 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v363*/, s33 offset:1008 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v364*/, s33 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v365*/, s33 offset:1016 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v366*/, s33 offset:1020 @@ -3095,7 +3095,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v424*/, s33 offset:1252 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v425*/, s33 offset:1256 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v426*/, s33 offset:1260 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v427*/, s33 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v428*/, s33 offset:1268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v429*/, s33 offset:1272 @@ -3159,7 +3159,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v487*/, s33 offset:1504 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v488*/, s33 offset:1508 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v489*/, s33 offset:1512 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v490*/, s33 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v491*/, s33 offset:1520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v492*/, s33 offset:1524 @@ -3224,7 +3224,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v550*/, s33 offset:1756 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v551*/, s33 offset:1760 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v552*/, s33 offset:1764 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v553*/, s33 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v554*/, s33 offset:1772 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v555*/, s33 offset:1776 @@ -3288,7 +3288,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v613*/, s33 offset:2008 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v614*/, s33 offset:2012 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v615*/, s33 offset:2016 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v616*/, s33 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v617*/, s33 offset:2024 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v618*/, s33 offset:2028 @@ -3352,7 +3352,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v676*/, s33 offset:2260 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v677*/, s33 offset:2264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v678*/, s33 offset:2268 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v679*/, s33 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v680*/, s33 offset:2276 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v681*/, s33 offset:2280 @@ -3416,7 +3416,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v739*/, s33 offset:2512 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v740*/, s33 offset:2516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v741*/, s33 offset:2520 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v742*/, s33 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v743*/, s33 offset:2528 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v744*/, s33 offset:2532 @@ -3481,7 +3481,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v802*/, s33 offset:2764 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v803*/, s33 offset:2768 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v804*/, s33 offset:2772 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v805*/, s33 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v806*/, s33 offset:2780 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v807*/, s33 offset:2784 @@ -3545,7 +3545,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v865*/, s33 offset:3016 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v866*/, s33 offset:3020 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v867*/, s33 offset:3024 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v868*/, s33 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v869*/, s33 offset:3032 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v870*/, s33 offset:3036 @@ -3609,7 +3609,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v928*/, s33 offset:3268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v929*/, s33 offset:3272 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v930*/, s33 offset:3276 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v931*/, s33 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v932*/, s33 offset:3284 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v933*/, s33 offset:3288 @@ -3673,7 +3673,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v991*/, s33 offset:3520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v992*/, s33 offset:3524 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v993*/, s33 offset:3528 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v994*/, s33 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v995*/, s33 offset:3536 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v996*/, s33 offset:3540 @@ -3727,7 +3727,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, s33 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -3791,7 +3791,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 @@ -3855,7 +3855,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 @@ -3920,7 +3920,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v298*/, off, s33 offset:748 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v299*/, off, s33 offset:752 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v300*/, off, s33 offset:756 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v301*/, off, s33 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v302*/, off, s33 offset:764 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v303*/, off, s33 offset:768 @@ -3984,7 +3984,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v361*/, off, s33 offset:1000 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v362*/, off, s33 offset:1004 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v363*/, off, s33 offset:1008 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v364*/, off, s33 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v365*/, off, s33 offset:1016 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v366*/, off, s33 offset:1020 @@ -4048,7 +4048,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v424*/, off, s33 offset:1252 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v425*/, off, s33 offset:1256 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v426*/, off, s33 offset:1260 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 /*v427*/, off, s33 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v428*/, off, s33 offset:1268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v429*/, off, s33 offset:1272 @@ -4112,7 +4112,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v487*/, off, s33 offset:1504 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v488*/, off, s33 offset:1508 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v489*/, off, s33 offset:1512 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v490*/, off, s33 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v491*/, off, s33 offset:1520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v492*/, off, s33 offset:1524 @@ -4177,7 +4177,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v550*/, off, s33 offset:1756 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v551*/, off, s33 offset:1760 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v552*/, off, s33 offset:1764 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v553*/, off, s33 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v554*/, off, s33 offset:1772 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v555*/, off, s33 offset:1776 @@ -4241,7 +4241,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v613*/, off, s33 offset:2008 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v614*/, off, s33 offset:2012 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v615*/, off, s33 offset:2016 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v616*/, off, s33 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v617*/, off, s33 offset:2024 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v618*/, off, s33 offset:2028 @@ -4305,7 +4305,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v676*/, off, s33 offset:2260 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v677*/, off, s33 offset:2264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v678*/, off, s33 offset:2268 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v679*/, off, s33 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v680*/, off, s33 offset:2276 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v681*/, off, s33 offset:2280 @@ -4369,7 +4369,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v739*/, off, s33 offset:2512 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v740*/, off, s33 offset:2516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v741*/, off, s33 offset:2520 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v742*/, off, s33 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v743*/, off, s33 offset:2528 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v744*/, off, s33 offset:2532 @@ -4434,7 +4434,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v802*/, off, s33 offset:2764 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v803*/, off, s33 offset:2768 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v804*/, off, s33 offset:2772 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v805*/, off, s33 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v806*/, off, s33 offset:2780 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v807*/, off, s33 offset:2784 @@ -4498,7 +4498,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v865*/, off, s33 offset:3016 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v866*/, off, s33 offset:3020 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v867*/, off, s33 offset:3024 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v868*/, off, s33 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v869*/, off, s33 offset:3032 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v870*/, off, s33 offset:3036 @@ -4562,7 +4562,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v928*/, off, s33 offset:3268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v929*/, off, s33 offset:3272 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v930*/, off, s33 offset:3276 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v931*/, off, s33 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v932*/, off, s33 offset:3284 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v933*/, off, s33 offset:3288 @@ -4626,7 +4626,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v991*/, off, s33 offset:3520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v992*/, off, s33 offset:3524 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v993*/, off, s33 offset:3528 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v994*/, off, s33 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v995*/, off, s33 offset:3536 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v996*/, off, s33 offset:3540 @@ -4677,7 +4677,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 ; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -4710,7 +4710,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116 ; DAGISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120 ; DAGISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128 ; DAGISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132 ; DAGISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136 @@ -4743,7 +4743,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 ; DAGISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 ; DAGISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 ; DAGISEL-NEXT: scratch_store_b32 off, v97, s32 offset:260 ; DAGISEL-NEXT: scratch_store_b32 off, v98, s32 offset:264 @@ -4776,7 +4776,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_store_b32 off, v149, s32 offset:372 ; DAGISEL-NEXT: scratch_store_b32 off, v150, s32 offset:376 ; DAGISEL-NEXT: scratch_store_b32 off, v151, s32 offset:380 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v160, s32 offset:384 ; DAGISEL-NEXT: scratch_store_b32 off, v161, s32 offset:388 ; DAGISEL-NEXT: scratch_store_b32 off, v162, s32 offset:392 @@ -4809,7 +4809,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 ; DAGISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 ; DAGISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 ; DAGISEL-NEXT: scratch_store_b32 off, v225, s32 offset:516 ; DAGISEL-NEXT: scratch_store_b32 off, v226, s32 offset:520 @@ -4833,7 +4833,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: v_swap_b32 v0, v1 ; DAGISEL-NEXT: s_wait_alu 0xfffe ; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -4866,7 +4866,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_load_b32 v29, off, s32 offset:116 ; DAGISEL-NEXT: scratch_load_b32 v30, off, s32 offset:120 ; DAGISEL-NEXT: scratch_load_b32 v31, off, s32 offset:124 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v32, off, s32 offset:128 ; DAGISEL-NEXT: scratch_load_b32 v33, off, s32 offset:132 ; DAGISEL-NEXT: scratch_load_b32 v34, off, s32 offset:136 @@ -4899,7 +4899,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 ; DAGISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 ; DAGISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 ; DAGISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 ; DAGISEL-NEXT: scratch_load_b32 v98, off, s32 offset:264 @@ -4932,7 +4932,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_load_b32 v149, off, s32 offset:372 ; DAGISEL-NEXT: scratch_load_b32 v150, off, s32 offset:376 ; DAGISEL-NEXT: scratch_load_b32 v151, off, s32 offset:380 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v160, off, s32 offset:384 ; DAGISEL-NEXT: scratch_load_b32 v161, off, s32 offset:388 ; DAGISEL-NEXT: scratch_load_b32 v162, off, s32 offset:392 @@ -4965,7 +4965,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 ; DAGISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 ; DAGISEL-NEXT: scratch_load_b32 v215, off, s32 offset:508 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 ; DAGISEL-NEXT: scratch_load_b32 v225, off, s32 offset:516 ; DAGISEL-NEXT: scratch_load_b32 v226, off, s32 offset:520 @@ -4993,7 +4993,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 ; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -5026,7 +5026,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116 ; GISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120 ; GISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128 ; GISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132 ; GISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136 @@ -5059,7 +5059,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 ; GISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 ; GISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 ; GISEL-NEXT: scratch_store_b32 off, v97, s32 offset:260 ; GISEL-NEXT: scratch_store_b32 off, v98, s32 offset:264 @@ -5092,7 +5092,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_store_b32 off, v149, s32 offset:372 ; GISEL-NEXT: scratch_store_b32 off, v150, s32 offset:376 ; GISEL-NEXT: scratch_store_b32 off, v151, s32 offset:380 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v160, s32 offset:384 ; GISEL-NEXT: scratch_store_b32 off, v161, s32 offset:388 ; GISEL-NEXT: scratch_store_b32 off, v162, s32 offset:392 @@ -5125,7 +5125,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 ; GISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 ; GISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 ; GISEL-NEXT: scratch_store_b32 off, v225, s32 offset:516 ; GISEL-NEXT: scratch_store_b32 off, v226, s32 offset:520 @@ -5149,7 +5149,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi ; GISEL-NEXT: s_wait_alu 0xfffe ; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -5182,7 +5182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_load_b32 v29, off, s32 offset:116 ; GISEL-NEXT: scratch_load_b32 v30, off, s32 offset:120 ; GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:124 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v32, off, s32 offset:128 ; GISEL-NEXT: scratch_load_b32 v33, off, s32 offset:132 ; GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:136 @@ -5215,7 +5215,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 ; GISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 ; GISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 ; GISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 ; GISEL-NEXT: scratch_load_b32 v98, off, s32 offset:264 @@ -5248,7 +5248,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_load_b32 v149, off, s32 offset:372 ; GISEL-NEXT: scratch_load_b32 v150, off, s32 offset:376 ; GISEL-NEXT: scratch_load_b32 v151, off, s32 offset:380 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v160, off, s32 offset:384 ; GISEL-NEXT: scratch_load_b32 v161, off, s32 offset:388 ; GISEL-NEXT: scratch_load_b32 v162, off, s32 offset:392 @@ -5281,7 +5281,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 ; GISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 ; GISEL-NEXT: scratch_load_b32 v215, off, s32 offset:508 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 ; GISEL-NEXT: scratch_load_b32 v225, off, s32 offset:516 ; GISEL-NEXT: scratch_load_b32 v226, off, s32 offset:520 @@ -5309,7 +5309,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL64-NEXT: s_wait_kmcnt 0x0 ; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -5342,7 +5342,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_store_b32 off, v29, s32 offset:116 ; DAGISEL64-NEXT: scratch_store_b32 off, v30, s32 offset:120 ; DAGISEL64-NEXT: scratch_store_b32 off, v31, s32 offset:124 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v32, s32 offset:128 ; DAGISEL64-NEXT: scratch_store_b32 off, v33, s32 offset:132 ; DAGISEL64-NEXT: scratch_store_b32 off, v34, s32 offset:136 @@ -5375,7 +5375,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_store_b32 off, v85, s32 offset:244 ; DAGISEL64-NEXT: scratch_store_b32 off, v86, s32 offset:248 ; DAGISEL64-NEXT: scratch_store_b32 off, v87, s32 offset:252 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v96, s32 offset:256 ; DAGISEL64-NEXT: scratch_store_b32 off, v97, s32 offset:260 ; DAGISEL64-NEXT: scratch_store_b32 off, v98, s32 offset:264 @@ -5408,7 +5408,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_store_b32 off, v149, s32 offset:372 ; DAGISEL64-NEXT: scratch_store_b32 off, v150, s32 offset:376 ; DAGISEL64-NEXT: scratch_store_b32 off, v151, s32 offset:380 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v160, s32 offset:384 ; DAGISEL64-NEXT: scratch_store_b32 off, v161, s32 offset:388 ; DAGISEL64-NEXT: scratch_store_b32 off, v162, s32 offset:392 @@ -5441,7 +5441,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_store_b32 off, v213, s32 offset:500 ; DAGISEL64-NEXT: scratch_store_b32 off, v214, s32 offset:504 ; DAGISEL64-NEXT: scratch_store_b32 off, v215, s32 offset:508 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v224, s32 offset:512 ; DAGISEL64-NEXT: scratch_store_b32 off, v225, s32 offset:516 ; DAGISEL64-NEXT: scratch_store_b32 off, v226, s32 offset:520 @@ -5465,7 +5465,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: v_swap_b32 v0, v1 ; DAGISEL64-NEXT: s_wait_alu 0xfffe ; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -5498,7 +5498,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_load_b32 v29, off, s32 offset:116 ; DAGISEL64-NEXT: scratch_load_b32 v30, off, s32 offset:120 ; DAGISEL64-NEXT: scratch_load_b32 v31, off, s32 offset:124 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v32, off, s32 offset:128 ; DAGISEL64-NEXT: scratch_load_b32 v33, off, s32 offset:132 ; DAGISEL64-NEXT: scratch_load_b32 v34, off, s32 offset:136 @@ -5531,7 +5531,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_load_b32 v85, off, s32 offset:244 ; DAGISEL64-NEXT: scratch_load_b32 v86, off, s32 offset:248 ; DAGISEL64-NEXT: scratch_load_b32 v87, off, s32 offset:252 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v96, off, s32 offset:256 ; DAGISEL64-NEXT: scratch_load_b32 v97, off, s32 offset:260 ; DAGISEL64-NEXT: scratch_load_b32 v98, off, s32 offset:264 @@ -5564,7 +5564,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_load_b32 v149, off, s32 offset:372 ; DAGISEL64-NEXT: scratch_load_b32 v150, off, s32 offset:376 ; DAGISEL64-NEXT: scratch_load_b32 v151, off, s32 offset:380 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v160, off, s32 offset:384 ; DAGISEL64-NEXT: scratch_load_b32 v161, off, s32 offset:388 ; DAGISEL64-NEXT: scratch_load_b32 v162, off, s32 offset:392 @@ -5597,7 +5597,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; DAGISEL64-NEXT: scratch_load_b32 v213, off, s32 offset:500 ; DAGISEL64-NEXT: scratch_load_b32 v214, off, s32 offset:504 ; DAGISEL64-NEXT: scratch_load_b32 v215, off, s32 offset:508 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v224, off, s32 offset:512 ; DAGISEL64-NEXT: scratch_load_b32 v225, off, s32 offset:516 ; DAGISEL64-NEXT: scratch_load_b32 v226, off, s32 offset:520 @@ -5625,7 +5625,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: s_wait_bvhcnt 0x0 ; GISEL64-NEXT: s_wait_kmcnt 0x0 ; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -5658,7 +5658,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_store_b32 off, v29, s32 offset:116 ; GISEL64-NEXT: scratch_store_b32 off, v30, s32 offset:120 ; GISEL64-NEXT: scratch_store_b32 off, v31, s32 offset:124 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v32, s32 offset:128 ; GISEL64-NEXT: scratch_store_b32 off, v33, s32 offset:132 ; GISEL64-NEXT: scratch_store_b32 off, v34, s32 offset:136 @@ -5691,7 +5691,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_store_b32 off, v85, s32 offset:244 ; GISEL64-NEXT: scratch_store_b32 off, v86, s32 offset:248 ; GISEL64-NEXT: scratch_store_b32 off, v87, s32 offset:252 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v96, s32 offset:256 ; GISEL64-NEXT: scratch_store_b32 off, v97, s32 offset:260 ; GISEL64-NEXT: scratch_store_b32 off, v98, s32 offset:264 @@ -5724,7 +5724,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_store_b32 off, v149, s32 offset:372 ; GISEL64-NEXT: scratch_store_b32 off, v150, s32 offset:376 ; GISEL64-NEXT: scratch_store_b32 off, v151, s32 offset:380 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v160, s32 offset:384 ; GISEL64-NEXT: scratch_store_b32 off, v161, s32 offset:388 ; GISEL64-NEXT: scratch_store_b32 off, v162, s32 offset:392 @@ -5757,7 +5757,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_store_b32 off, v213, s32 offset:500 ; GISEL64-NEXT: scratch_store_b32 off, v214, s32 offset:504 ; GISEL64-NEXT: scratch_store_b32 off, v215, s32 offset:508 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v224, s32 offset:512 ; GISEL64-NEXT: scratch_store_b32 off, v225, s32 offset:516 ; GISEL64-NEXT: scratch_store_b32 off, v226, s32 offset:520 @@ -5781,7 +5781,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi ; GISEL64-NEXT: s_wait_alu 0xfffe ; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -5814,7 +5814,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_load_b32 v29, off, s32 offset:116 ; GISEL64-NEXT: scratch_load_b32 v30, off, s32 offset:120 ; GISEL64-NEXT: scratch_load_b32 v31, off, s32 offset:124 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v32, off, s32 offset:128 ; GISEL64-NEXT: scratch_load_b32 v33, off, s32 offset:132 ; GISEL64-NEXT: scratch_load_b32 v34, off, s32 offset:136 @@ -5847,7 +5847,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_load_b32 v85, off, s32 offset:244 ; GISEL64-NEXT: scratch_load_b32 v86, off, s32 offset:248 ; GISEL64-NEXT: scratch_load_b32 v87, off, s32 offset:252 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v96, off, s32 offset:256 ; GISEL64-NEXT: scratch_load_b32 v97, off, s32 offset:260 ; GISEL64-NEXT: scratch_load_b32 v98, off, s32 offset:264 @@ -5880,7 +5880,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_load_b32 v149, off, s32 offset:372 ; GISEL64-NEXT: scratch_load_b32 v150, off, s32 offset:376 ; GISEL64-NEXT: scratch_load_b32 v151, off, s32 offset:380 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v160, off, s32 offset:384 ; GISEL64-NEXT: scratch_load_b32 v161, off, s32 offset:388 ; GISEL64-NEXT: scratch_load_b32 v162, off, s32 offset:392 @@ -5913,7 +5913,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GISEL64-NEXT: scratch_load_b32 v213, off, s32 offset:500 ; GISEL64-NEXT: scratch_load_b32 v214, off, s32 offset:504 ; GISEL64-NEXT: scratch_load_b32 v215, off, s32 offset:508 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v224, off, s32 offset:512 ; GISEL64-NEXT: scratch_load_b32 v225, off, s32 offset:516 ; GISEL64-NEXT: scratch_load_b32 v226, off, s32 offset:520 @@ -5938,7 +5938,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 @@ -6002,7 +6002,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84, s32 offset:240 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97, s32 offset:260 @@ -6066,7 +6066,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211, s32 offset:492 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212, s32 offset:496 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 @@ -6131,7 +6131,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v298*/, s32 offset:744 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v299*/, s32 offset:748 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v300*/, s32 offset:752 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v45 /*v301*/, s32 offset:756 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v302*/, s32 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v303*/, s32 offset:764 @@ -6195,7 +6195,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v361*/, s32 offset:996 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v362*/, s32 offset:1000 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v363*/, s32 offset:1004 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v364*/, s32 offset:1008 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v365*/, s32 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v366*/, s32 offset:1016 @@ -6259,7 +6259,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v424*/, s32 offset:1248 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v425*/, s32 offset:1252 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v426*/, s32 offset:1256 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v427*/, s32 offset:1260 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v428*/, s32 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v429*/, s32 offset:1268 @@ -6323,7 +6323,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v487*/, s32 offset:1500 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v488*/, s32 offset:1504 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v489*/, s32 offset:1508 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v490*/, s32 offset:1512 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v491*/, s32 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v492*/, s32 offset:1520 @@ -6388,7 +6388,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v550*/, s32 offset:1752 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v551*/, s32 offset:1756 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v552*/, s32 offset:1760 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v553*/, s32 offset:1764 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v554*/, s32 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v555*/, s32 offset:1772 @@ -6452,7 +6452,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v613*/, s32 offset:2004 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v614*/, s32 offset:2008 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v615*/, s32 offset:2012 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v616*/, s32 offset:2016 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v617*/, s32 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v618*/, s32 offset:2024 @@ -6516,7 +6516,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v676*/, s32 offset:2256 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v677*/, s32 offset:2260 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v678*/, s32 offset:2264 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v679*/, s32 offset:2268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v680*/, s32 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v681*/, s32 offset:2276 @@ -6580,7 +6580,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v739*/, s32 offset:2508 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v740*/, s32 offset:2512 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v741*/, s32 offset:2516 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v742*/, s32 offset:2520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v743*/, s32 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v744*/, s32 offset:2528 @@ -6645,7 +6645,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v802*/, s32 offset:2760 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v803*/, s32 offset:2764 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v804*/, s32 offset:2768 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v805*/, s32 offset:2772 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v806*/, s32 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v807*/, s32 offset:2780 @@ -6709,7 +6709,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v865*/, s32 offset:3012 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v866*/, s32 offset:3016 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v867*/, s32 offset:3020 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v868*/, s32 offset:3024 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v869*/, s32 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v870*/, s32 offset:3032 @@ -6773,7 +6773,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v928*/, s32 offset:3264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v929*/, s32 offset:3268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v930*/, s32 offset:3272 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v931*/, s32 offset:3276 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v932*/, s32 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v933*/, s32 offset:3284 @@ -6837,7 +6837,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v991*/, s32 offset:3516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v992*/, s32 offset:3520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v993*/, s32 offset:3524 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v994*/, s32 offset:3528 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v995*/, s32 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v996*/, s32 offset:3536 @@ -6875,7 +6875,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 @@ -6939,7 +6939,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84, off, s32 offset:240 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 @@ -7003,7 +7003,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211, off, s32 offset:492 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212, off, s32 offset:496 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215, off, s32 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 @@ -7068,7 +7068,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v298*/, off, s32 offset:744 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v299*/, off, s32 offset:748 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v300*/, off, s32 offset:752 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v301*/, off, s32 offset:756 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v302*/, off, s32 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v303*/, off, s32 offset:764 @@ -7132,7 +7132,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v361*/, off, s32 offset:996 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v362*/, off, s32 offset:1000 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v363*/, off, s32 offset:1004 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v364*/, off, s32 offset:1008 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v365*/, off, s32 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v366*/, off, s32 offset:1016 @@ -7196,7 +7196,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v424*/, off, s32 offset:1248 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v425*/, off, s32 offset:1252 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v426*/, off, s32 offset:1256 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 /*v427*/, off, s32 offset:1260 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v428*/, off, s32 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v429*/, off, s32 offset:1268 @@ -7260,7 +7260,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v487*/, off, s32 offset:1500 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v488*/, off, s32 offset:1504 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v489*/, off, s32 offset:1508 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v490*/, off, s32 offset:1512 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v491*/, off, s32 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v492*/, off, s32 offset:1520 @@ -7325,7 +7325,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v550*/, off, s32 offset:1752 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v551*/, off, s32 offset:1756 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v552*/, off, s32 offset:1760 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v553*/, off, s32 offset:1764 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v554*/, off, s32 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v555*/, off, s32 offset:1772 @@ -7389,7 +7389,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v613*/, off, s32 offset:2004 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v614*/, off, s32 offset:2008 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v615*/, off, s32 offset:2012 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v616*/, off, s32 offset:2016 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v617*/, off, s32 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v618*/, off, s32 offset:2024 @@ -7453,7 +7453,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v676*/, off, s32 offset:2256 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v677*/, off, s32 offset:2260 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v678*/, off, s32 offset:2264 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v679*/, off, s32 offset:2268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v680*/, off, s32 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v681*/, off, s32 offset:2276 @@ -7517,7 +7517,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v739*/, off, s32 offset:2508 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v740*/, off, s32 offset:2512 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v741*/, off, s32 offset:2516 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v742*/, off, s32 offset:2520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v743*/, off, s32 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v744*/, off, s32 offset:2528 @@ -7582,7 +7582,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v802*/, off, s32 offset:2760 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v803*/, off, s32 offset:2764 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v804*/, off, s32 offset:2768 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v805*/, off, s32 offset:2772 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v806*/, off, s32 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v807*/, off, s32 offset:2780 @@ -7646,7 +7646,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v865*/, off, s32 offset:3012 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v866*/, off, s32 offset:3016 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v867*/, off, s32 offset:3020 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v868*/, off, s32 offset:3024 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v869*/, off, s32 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v870*/, off, s32 offset:3032 @@ -7710,7 +7710,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v928*/, off, s32 offset:3264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v929*/, off, s32 offset:3268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v930*/, off, s32 offset:3272 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v931*/, off, s32 offset:3276 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v932*/, off, s32 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v933*/, off, s32 offset:3284 @@ -7774,7 +7774,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v991*/, off, s32 offset:3516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v992*/, off, s32 offset:3520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v993*/, off, s32 offset:3524 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v994*/, off, s32 offset:3528 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v995*/, off, s32 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v996*/, off, s32 offset:3536 @@ -7860,8 +7860,9 @@ define amdgpu_cs void @call_from_entry(<8 x float> %x, ptr %p) { ; ; GFX1250-DAGISEL-LABEL: call_from_entry: ; GFX1250-DAGISEL: ; %bb.0: -; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], callee@abs64 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, 0 +; GFX1250-DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GFX1250-DAGISEL-NEXT: s_mov_b64 s[0:1], callee@abs64 ; GFX1250-DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8 ; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] ; GFX1250-DAGISEL-NEXT: flat_store_b32 v[40:41], v0 @@ -7882,7 +7883,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: s_mov_b32 s0, s33 ; DAGISEL-NEXT: s_mov_b32 s33, s32 ; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -7915,7 +7916,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -7948,7 +7949,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256 ; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260 ; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268 ; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272 ; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276 @@ -7981,7 +7982,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384 ; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388 ; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396 ; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400 ; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404 @@ -8014,7 +8015,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512 ; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516 ; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524 ; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528 ; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532 @@ -8032,7 +8033,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580 ; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584 ; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; DAGISEL-NEXT: s_clause 0x2 +; DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill ; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 ; DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168 @@ -8052,13 +8053,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1 ; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0 ; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3 -; DAGISEL-NEXT: s_clause 0x2 +; DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 ; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164 ; DAGISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168 ; DAGISEL-NEXT: s_mov_b32 s32, s33 ; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -8091,7 +8092,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -8124,7 +8125,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256 ; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260 ; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268 ; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272 ; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276 @@ -8157,7 +8158,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384 ; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388 ; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392 -; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396 ; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400 ; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404 @@ -8190,7 +8191,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512 ; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516 ; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520 -; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524 ; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528 ; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532 @@ -8223,7 +8224,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: s_mov_b32 s0, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -8256,7 +8257,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -8289,7 +8290,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256 ; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260 ; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268 ; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272 ; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276 @@ -8322,7 +8323,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384 ; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388 ; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396 ; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400 ; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404 @@ -8355,7 +8356,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512 ; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516 ; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524 ; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528 ; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532 @@ -8373,7 +8374,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580 ; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584 ; GISEL-NEXT: s_mov_b32 exec_lo, -1 -; GISEL-NEXT: s_clause 0x2 +; GISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill ; GISEL-NEXT: scratch_store_b32 off, v42, s33 ; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 ; GISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168 @@ -8393,13 +8394,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: v_readlane_b32 s30, v42, 1 ; GISEL-NEXT: v_readlane_b32 s4, v42, 0 ; GISEL-NEXT: v_readlane_b32 s0, v42, 3 -; GISEL-NEXT: s_clause 0x2 +; GISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v42, off, s33 ; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164 ; GISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -8432,7 +8433,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -8465,7 +8466,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256 ; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260 ; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268 ; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272 ; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276 @@ -8498,7 +8499,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384 ; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388 ; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392 -; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396 ; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400 ; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404 @@ -8531,7 +8532,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512 ; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516 ; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520 -; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524 ; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528 ; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532 @@ -8564,7 +8565,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: s_mov_b32 s0, s33 ; DAGISEL64-NEXT: s_mov_b32 s33, s32 ; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -8597,7 +8598,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -8630,7 +8631,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256 ; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260 ; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268 ; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272 ; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276 @@ -8663,7 +8664,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384 ; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388 ; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396 ; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400 ; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404 @@ -8696,7 +8697,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512 ; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516 ; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524 ; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528 ; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532 @@ -8714,7 +8715,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580 ; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584 ; DAGISEL64-NEXT: s_mov_b64 exec, -1 -; DAGISEL64-NEXT: s_clause 0x2 +; DAGISEL64-NEXT: s_clause 0x2 ; 12-byte Folded Spill ; DAGISEL64-NEXT: scratch_store_b32 off, v42, s33 ; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164 ; DAGISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168 @@ -8737,13 +8738,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1 ; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0 ; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4 -; DAGISEL64-NEXT: s_clause 0x2 +; DAGISEL64-NEXT: s_clause 0x2 ; 12-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v42, off, s33 ; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164 ; DAGISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168 ; DAGISEL64-NEXT: s_mov_b32 s32, s33 ; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -8776,7 +8777,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -8809,7 +8810,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256 ; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260 ; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268 ; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272 ; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276 @@ -8842,7 +8843,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384 ; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388 ; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392 -; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396 ; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400 ; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404 @@ -8875,7 +8876,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512 ; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516 ; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520 -; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524 ; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528 ; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532 @@ -8908,7 +8909,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: s_mov_b32 s0, s33 ; GISEL64-NEXT: s_mov_b32 s33, s32 ; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -8941,7 +8942,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 ; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 ; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 ; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 ; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 @@ -8974,7 +8975,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256 ; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260 ; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268 ; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272 ; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276 @@ -9007,7 +9008,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384 ; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388 ; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396 ; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400 ; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404 @@ -9040,7 +9041,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512 ; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516 ; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524 ; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528 ; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532 @@ -9058,7 +9059,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580 ; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584 ; GISEL64-NEXT: s_mov_b64 exec, -1 -; GISEL64-NEXT: s_clause 0x2 +; GISEL64-NEXT: s_clause 0x2 ; 12-byte Folded Spill ; GISEL64-NEXT: scratch_store_b32 off, v42, s33 ; GISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164 ; GISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168 @@ -9081,13 +9082,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: v_readlane_b32 s5, v42, 1 ; GISEL64-NEXT: v_readlane_b32 s4, v42, 0 ; GISEL64-NEXT: v_readlane_b32 s0, v42, 4 -; GISEL64-NEXT: s_clause 0x2 +; GISEL64-NEXT: s_clause 0x2 ; 12-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v42, off, s33 ; GISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164 ; GISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168 ; GISEL64-NEXT: s_mov_b32 s32, s33 ; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -9120,7 +9121,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 ; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 ; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 ; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 ; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 @@ -9153,7 +9154,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256 ; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260 ; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268 ; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272 ; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276 @@ -9186,7 +9187,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384 ; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388 ; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392 -; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396 ; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400 ; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404 @@ -9219,7 +9220,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512 ; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516 ; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520 -; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524 ; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528 ; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532 @@ -9249,7 +9250,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_mov_b32 s0, s33 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s32 ; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 @@ -9313,7 +9314,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:252 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272 @@ -9377,7 +9378,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:504 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524 @@ -9442,7 +9443,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v298*/, s33 offset:756 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v299*/, s33 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v300*/, s33 offset:764 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v45 /*v301*/, s33 offset:768 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v302*/, s33 offset:772 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v303*/, s33 offset:776 @@ -9506,7 +9507,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v361*/, s33 offset:1008 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v362*/, s33 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v363*/, s33 offset:1016 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v364*/, s33 offset:1020 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v365*/, s33 offset:1024 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v366*/, s33 offset:1028 @@ -9570,7 +9571,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v424*/, s33 offset:1260 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v425*/, s33 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v426*/, s33 offset:1268 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v427*/, s33 offset:1272 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v428*/, s33 offset:1276 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v429*/, s33 offset:1280 @@ -9634,7 +9635,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v487*/, s33 offset:1512 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v488*/, s33 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v489*/, s33 offset:1520 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v490*/, s33 offset:1524 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v491*/, s33 offset:1528 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v492*/, s33 offset:1532 @@ -9699,7 +9700,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v550*/, s33 offset:1764 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v551*/, s33 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v552*/, s33 offset:1772 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v553*/, s33 offset:1776 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v554*/, s33 offset:1780 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v555*/, s33 offset:1784 @@ -9763,7 +9764,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v613*/, s33 offset:2016 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v614*/, s33 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v615*/, s33 offset:2024 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v616*/, s33 offset:2028 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v617*/, s33 offset:2032 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v618*/, s33 offset:2036 @@ -9827,7 +9828,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v676*/, s33 offset:2268 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v677*/, s33 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v678*/, s33 offset:2276 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v679*/, s33 offset:2280 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v680*/, s33 offset:2284 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v681*/, s33 offset:2288 @@ -9891,7 +9892,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v739*/, s33 offset:2520 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v740*/, s33 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v741*/, s33 offset:2528 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v742*/, s33 offset:2532 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v743*/, s33 offset:2536 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v744*/, s33 offset:2540 @@ -9956,7 +9957,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v802*/, s33 offset:2772 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v803*/, s33 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v804*/, s33 offset:2780 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v805*/, s33 offset:2784 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v806*/, s33 offset:2788 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v807*/, s33 offset:2792 @@ -10020,7 +10021,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v865*/, s33 offset:3024 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v866*/, s33 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v867*/, s33 offset:3032 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v868*/, s33 offset:3036 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v869*/, s33 offset:3040 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v870*/, s33 offset:3044 @@ -10084,7 +10085,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v928*/, s33 offset:3276 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v929*/, s33 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v930*/, s33 offset:3284 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v931*/, s33 offset:3288 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v932*/, s33 offset:3292 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v933*/, s33 offset:3296 @@ -10148,7 +10149,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v991*/, s33 offset:3528 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v992*/, s33 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v993*/, s33 offset:3536 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v994*/, s33 offset:3540 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v995*/, s33 offset:3544 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v996*/, s33 offset:3548 @@ -10182,7 +10183,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 ; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-DAGISEL-NEXT: s_clause 0x2 +; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168 @@ -10201,14 +10202,14 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v42, 0 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v42, 3 -; GFX1250-DAGISEL-NEXT: s_clause 0x2 +; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42, off, s33 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, s33 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 @@ -10272,7 +10273,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:252 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272 @@ -10336,7 +10337,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:504 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:508 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524 @@ -10401,7 +10402,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v298*/, off, s33 offset:756 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v299*/, off, s33 offset:760 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v300*/, off, s33 offset:764 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v301*/, off, s33 offset:768 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v302*/, off, s33 offset:772 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v303*/, off, s33 offset:776 @@ -10465,7 +10466,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v361*/, off, s33 offset:1008 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v362*/, off, s33 offset:1012 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v363*/, off, s33 offset:1016 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v364*/, off, s33 offset:1020 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v365*/, off, s33 offset:1024 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v366*/, off, s33 offset:1028 @@ -10529,7 +10530,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v424*/, off, s33 offset:1260 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v425*/, off, s33 offset:1264 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v426*/, off, s33 offset:1268 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 /*v427*/, off, s33 offset:1272 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v428*/, off, s33 offset:1276 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v429*/, off, s33 offset:1280 @@ -10593,7 +10594,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v487*/, off, s33 offset:1512 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v488*/, off, s33 offset:1516 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v489*/, off, s33 offset:1520 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v490*/, off, s33 offset:1524 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v491*/, off, s33 offset:1528 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v492*/, off, s33 offset:1532 @@ -10658,7 +10659,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v550*/, off, s33 offset:1764 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v551*/, off, s33 offset:1768 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v552*/, off, s33 offset:1772 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v553*/, off, s33 offset:1776 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v554*/, off, s33 offset:1780 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v555*/, off, s33 offset:1784 @@ -10722,7 +10723,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v613*/, off, s33 offset:2016 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v614*/, off, s33 offset:2020 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v615*/, off, s33 offset:2024 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v616*/, off, s33 offset:2028 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v617*/, off, s33 offset:2032 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v618*/, off, s33 offset:2036 @@ -10786,7 +10787,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v676*/, off, s33 offset:2268 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v677*/, off, s33 offset:2272 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v678*/, off, s33 offset:2276 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v679*/, off, s33 offset:2280 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v680*/, off, s33 offset:2284 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v681*/, off, s33 offset:2288 @@ -10850,7 +10851,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v739*/, off, s33 offset:2520 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v740*/, off, s33 offset:2524 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v741*/, off, s33 offset:2528 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v742*/, off, s33 offset:2532 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v743*/, off, s33 offset:2536 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v744*/, off, s33 offset:2540 @@ -10915,7 +10916,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v802*/, off, s33 offset:2772 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v803*/, off, s33 offset:2776 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v804*/, off, s33 offset:2780 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v805*/, off, s33 offset:2784 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v806*/, off, s33 offset:2788 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v807*/, off, s33 offset:2792 @@ -10979,7 +10980,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v865*/, off, s33 offset:3024 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v866*/, off, s33 offset:3028 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v867*/, off, s33 offset:3032 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v868*/, off, s33 offset:3036 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v869*/, off, s33 offset:3040 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v870*/, off, s33 offset:3044 @@ -11043,7 +11044,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v928*/, off, s33 offset:3276 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v929*/, off, s33 offset:3280 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v930*/, off, s33 offset:3284 -; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: s_clause 0x3e ; 252-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v931*/, off, s33 offset:3288 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v932*/, off, s33 offset:3292 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v933*/, off, s33 offset:3296 @@ -11107,7 +11108,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v991*/, off, s33 offset:3528 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v992*/, off, s33 offset:3532 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v993*/, off, s33 offset:3536 -; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: s_clause 0x1d ; 120-byte Folded Reload ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v994*/, off, s33 offset:3540 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v995*/, off, s33 offset:3544 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v996*/, off, s33 offset:3548 diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 6636eb544343b..59968bc206ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -27,6 +27,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX1250-SDAG-LABEL: workgroup_id_x: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-SDAG-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; GFX1250-SDAG-NEXT: s_and_b32 s3, ttmp6, 15 @@ -45,6 +46,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { ; ; GFX1250-GISEL-LABEL: workgroup_id_x: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_bfe_u32 s2, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s3, ttmp6, 15 @@ -101,6 +103,7 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; ; GFX1250-SDAG-LABEL: workgroup_id_xy: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-SDAG-NEXT: s_bfe_u32 s6, ttmp6, 0x40010 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -128,6 +131,7 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; ; GFX1250-GISEL-LABEL: workgroup_id_xy: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s6, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_add_co_i32 s6, s6, 1 @@ -210,6 +214,7 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; ; GFX1250-SDAG-LABEL: workgroup_id_xyz: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 ; GFX1250-SDAG-NEXT: s_lshr_b32 s8, ttmp7, 16 ; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s0, 1 @@ -248,6 +253,7 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; ; GFX1250-GISEL-LABEL: workgroup_id_xyz: ; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1