From c316332e1789221ec26875d1dc335382b6e68d83 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Thu, 6 Oct 2022 09:06:32 +0900 Subject: [PATCH] [Sink] Allow sinking of invariant loads across critical edges Invariant loads can always be sunk. Reviewed By: foad, arsenm Differential Revision: https://reviews.llvm.org/D135133 --- llvm/lib/Transforms/Scalar/Sink.cpp | 3 +- .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 118 +- .../GlobalISel/llvm.amdgcn.end.cf.i32.ll | 12 +- .../GlobalISel/llvm.amdgcn.end.cf.i64.ll | 7 +- .../AMDGPU/GlobalISel/non-entry-alloca.ll | 10 +- .../AMDGPU/atomic_optimizations_buffer.ll | 1396 ++++++++--------- .../atomic_optimizations_local_pointer.ll | 1273 +++++++-------- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 1164 +++++++------- .../atomic_optimizations_struct_buffer.ll | 1200 +++++++------- .../branch-relaxation-inst-size-gfx10.ll | 1 - llvm/test/CodeGen/AMDGPU/madak.ll | 3 +- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 2 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 81 +- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 78 +- .../si-unify-exit-multiple-unreachables.ll | 3 + llvm/test/CodeGen/AMDGPU/srem64.ll | 85 +- .../AMDGPU/subreg-coalescer-undef-use.ll | 34 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 114 +- llvm/test/Transforms/Sink/invariant-load.ll | 29 + 19 files changed, 2733 insertions(+), 2880 deletions(-) create mode 100644 llvm/test/Transforms/Sink/invariant-load.ll diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp index e8fde53005f0b..dad45c47e0c24 100644 --- a/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/llvm/lib/Transforms/Scalar/Sink.cpp @@ -79,7 +79,8 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. - if (Inst->mayReadFromMemory()) + if (Inst->mayReadFromMemory() && + !Inst->hasMetadata(LLVMContext::MD_invariant_load)) return false; // We don't want to sink across a critical edge if we don't dominate the diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 908874e073e73..99ef58147896c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1354,20 +1354,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz .LBB13_2 ; GFX7-NEXT: ; %bb.1: ; %bb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x14 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x14 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cmp_lg_u32 s0, 0 +; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_cselect_b32 s6, 1, 0 ; GFX7-NEXT: .LBB13_2: ; %exit ; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 @@ -1382,30 +1382,30 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB13_2 ; GFX8-NEXT: ; %bb.1: ; %bb -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: .LBB13_2: ; %exit -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_add_u32 s0, s2, 8 -; GFX8-NEXT: s_addc_u32 s1, s3, 0 -; GFX8-NEXT: s_and_b32 s2, 1, s6 +; GFX8-NEXT: s_add_u32 s0, s0, 8 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_and_b32 s2, 1, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_nop 2 @@ -1420,29 +1420,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10_W32-NEXT: s_mov_b32 s5, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] ; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10_W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10_W32-NEXT: s_mov_b32 s2, 0 +; GFX10_W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb -; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50 +; GFX10_W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10_W32-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10_W32-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10_W32-NEXT: .LBB13_2: ; %exit -; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10_W32-NEXT: s_and_b32 s0, 1, s5 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] offset:8 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8 ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: @@ -1450,58 +1450,58 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: s_mov_b32 s6, 0 +; GFX10_W64-NEXT: s_mov_b32 s4, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] ; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX10_W64-NEXT: ; %bb.1: ; %bb -; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50 +; GFX10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10_W64-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10_W64-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10_W64-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10_W64-NEXT: .LBB13_2: ; %exit -; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8 ; GFX10_W64-NEXT: s_endpgm ; ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX11_W32: ; %bb.0: ; %entry ; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 ; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W32-NEXT: s_mov_b32 s5, 0 -; GFX11_W32-NEXT: s_mov_b32 s4, exec_lo ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3] -; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11_W32-NEXT: s_mov_b32 s2, 0 +; GFX11_W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W32-NEXT: ; %bb.1: ; %bb -; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x50 +; GFX11_W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x50 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11_W32-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11_W32-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11_W32-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11_W32-NEXT: s_cselect_b32 s2, 1, 0 ; GFX11_W32-NEXT: .LBB13_2: ; %exit -; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX11_W32-NEXT: s_and_b32 s0, 1, s5 -; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: s_waitcnt vmcnt(0) ; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W32-NEXT: global_store_b32 v1, v0, s[2:3] offset:8 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1509,29 +1509,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX11_W64: ; %bb.0: ; %entry ; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 ; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX11_W64-NEXT: s_mov_b32 s6, 0 -; GFX11_W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11_W64-NEXT: s_mov_b32 s4, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_load_b96 v[1:3], v1, s[2:3] -; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2 ; GFX11_W64-NEXT: ; %bb.1: ; %bb -; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x50 +; GFX11_W64-NEXT: s_load_b64 s[4:5], s[0:1], 0x50 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11_W64-NEXT: s_load_b32 s4, s[4:5], 0x0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11_W64-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11_W64-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11_W64-NEXT: .LBB13_2: ; %exit -; GFX11_W64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX11_W64-NEXT: s_and_b32 s0, 1, s6 -; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s4 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX11_W64-NEXT: s_waitcnt vmcnt(0) ; GFX11_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11_W64-NEXT: global_store_b32 v1, v0, s[2:3] offset:8 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll index e4dfe024568a3..fd2251bff0b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -5,17 +5,17 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX10-LABEL: test_wave32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 ; GFX10-NEXT: ; %bb.1: ; %mid ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB0_2: ; %bb +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -25,9 +25,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; ; GFX11-LABEL: test_wave32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 @@ -36,6 +34,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB0_2: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll index af7e551cbb1bd..1e39c6395ca7d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll @@ -4,16 +4,17 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) { ; GCN-LABEL: test_wave64: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %mid ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %bb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index ca6df8ccf4b1f..bf0758a645eba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -27,14 +27,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s7, s32, 0x1000 +; GCN-NEXT: s_load_dword s7, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 2 -; GCN-NEXT: s_add_u32 s6, s7, s6 +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_add_u32 s6, s6, s7 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 0d1cb169eaee2..c7bd302850671 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -17,200 +17,200 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 imm define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -222,159 +222,155 @@ entry: define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -382,54 +378,54 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -453,17 +449,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -475,44 +470,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -524,28 +519,29 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: @@ -566,43 +562,41 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -613,44 +607,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: s_mov_b32 s5, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -676,46 +668,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -726,8 +716,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -735,42 +726,39 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -797,17 +785,16 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -819,46 +806,46 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dword s7, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB3_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -870,30 +857,31 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: @@ -914,46 +902,44 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v4, s7 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s5 ; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: @@ -964,47 +950,45 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s5, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v4, s8 +; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB3_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: @@ -1030,49 +1014,47 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s7, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v4, s7 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s5 ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1083,8 +1065,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1092,45 +1075,42 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44 -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: v_mov_b32_e32 v4, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB3_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1213,207 +1193,207 @@ entry: define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1425,161 +1405,157 @@ entry: define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1587,56 +1563,56 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1660,17 +1636,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1682,44 +1657,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1731,28 +1706,29 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: @@ -1773,43 +1749,41 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1820,44 +1794,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: s_mov_b32 s5, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1883,46 +1855,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1933,8 +1903,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1942,42 +1913,39 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 94dfe2325c5fe..ea9022c83c4dc 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -19,107 +19,104 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -127,7 +124,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -138,7 +136,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1032-LABEL: add_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 @@ -158,6 +155,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -168,28 +166,28 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1164-LABEL: add_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: s_mul_i32 s4, s4, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB0_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -202,7 +200,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1132-LABEL: add_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 @@ -222,6 +219,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB0_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -242,117 +240,115 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 +; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_mul_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -360,32 +356,31 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -393,84 +388,83 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB1_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164-NEXT: s_mul_i32 s4, s6, s4 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB1_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] -; GFX1164-NEXT: s_mov_b32 s6, -1 -; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB1_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] -; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -497,7 +491,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] @@ -536,19 +529,18 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] @@ -586,13 +578,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -614,11 +606,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -649,6 +638,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB2_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 @@ -665,16 +655,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -699,6 +686,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB2_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 @@ -730,12 +718,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -767,6 +752,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -784,8 +770,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -793,22 +780,17 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -824,6 +806,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1075,7 +1058,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1093,22 +1075,22 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1130,18 +1112,19 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1162,18 +1145,19 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -1194,6 +1178,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] @@ -1205,7 +1190,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 @@ -1225,6 +1209,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1236,7 +1221,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 @@ -1258,6 +1242,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1271,7 +1256,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 @@ -1292,6 +1276,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1677,110 +1662,107 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s2, s2, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1788,7 +1770,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1800,7 +1783,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1032-LABEL: sub_i32_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 @@ -1820,6 +1802,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -1831,28 +1814,28 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1164-LABEL: sub_i32_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: s_mul_i32 s4, s4, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB7_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 @@ -1866,7 +1849,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; ; GFX1132-LABEL: sub_i32_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 @@ -1886,6 +1868,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB7_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 @@ -1907,117 +1890,115 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 +; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s2, s6, s2 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s2, s6, s2 +; GFX9-NEXT: s_mul_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: s_mul_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -2025,33 +2006,32 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_uniform: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 -; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: s_mul_i32 s4, s2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -2059,87 +2039,86 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_uniform: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_clause 0x1 -; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164-NEXT: s_mul_i32 s4, s6, s4 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_uniform: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_clause 0x1 -; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 -; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c -; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mul_i32 s4, s2, s4 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2166,7 +2145,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] @@ -2205,19 +2183,18 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] @@ -2255,13 +2232,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2283,11 +2260,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -2318,6 +2292,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 @@ -2334,16 +2309,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2368,6 +2340,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 @@ -2399,12 +2372,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -2436,6 +2406,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2453,8 +2424,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -2462,22 +2434,17 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -2493,6 +2460,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2744,7 +2712,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -2762,22 +2729,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2796,22 +2763,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2829,22 +2796,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 @@ -2865,6 +2832,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -2879,7 +2847,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 @@ -2899,6 +2866,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -2913,7 +2881,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 @@ -2935,6 +2902,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 @@ -2951,7 +2919,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 @@ -2972,6 +2939,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -3386,7 +3354,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -3425,19 +3392,18 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -3475,13 +3441,13 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3503,11 +3469,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -3538,6 +3501,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB14_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 @@ -3554,16 +3518,13 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3588,6 +3549,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 @@ -3619,12 +3581,9 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -3656,6 +3615,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB14_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3673,8 +3633,9 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -3682,22 +3643,17 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -3713,6 +3669,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB14_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3747,7 +3704,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] @@ -3786,19 +3742,18 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] @@ -3836,13 +3791,13 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3864,11 +3819,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -3899,6 +3851,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB15_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 @@ -3915,16 +3868,13 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3949,6 +3899,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 @@ -3980,12 +3931,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -4017,6 +3965,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4034,8 +3983,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4043,22 +3993,17 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -4074,6 +4019,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4108,7 +4054,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] @@ -4147,19 +4092,18 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] @@ -4197,13 +4141,13 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4225,11 +4169,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -4260,6 +4201,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB16_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 @@ -4276,16 +4218,13 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4310,6 +4249,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 @@ -4341,12 +4281,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -4378,6 +4315,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4395,8 +4333,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -4404,22 +4343,17 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -4435,6 +4369,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4469,7 +4404,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -4508,19 +4442,18 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_max_i32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -4558,13 +4491,13 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_max_i32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4586,11 +4519,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -4621,6 +4551,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB17_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 @@ -4637,16 +4568,13 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4671,6 +4599,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 @@ -4702,12 +4631,9 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -4739,6 +4665,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB17_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4756,8 +4683,9 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -4765,22 +4693,17 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -4796,6 +4719,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB17_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4817,7 +4741,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: max_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4834,25 +4757,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 ; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: max_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4870,24 +4793,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: .LBB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4904,24 +4828,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: .LBB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: max_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -4940,6 +4865,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -4955,7 +4881,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1032-LABEL: max_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4973,6 +4898,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -4988,7 +4914,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1164-LABEL: max_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -5007,6 +4932,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB18_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc @@ -5024,7 +4950,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: max_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -5041,6 +4966,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB18_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo @@ -5079,7 +5005,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -5118,19 +5043,18 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_min_i32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -5168,13 +5092,13 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_min_i32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5196,11 +5120,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -5231,6 +5152,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB19_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 @@ -5247,16 +5169,13 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5281,6 +5200,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 @@ -5312,12 +5232,9 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -5349,6 +5266,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB19_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5366,8 +5284,9 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -5375,22 +5294,17 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -5406,6 +5320,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB19_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5427,7 +5342,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: min_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -5444,25 +5358,25 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 ; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: min_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -5482,6 +5396,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -5489,15 +5404,15 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -5516,6 +5431,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc @@ -5523,15 +5439,15 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: min_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -5550,6 +5466,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -5557,15 +5474,14 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: min_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5583,6 +5499,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -5590,15 +5507,14 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: min_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -5617,6 +5533,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB20_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc @@ -5625,8 +5542,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5634,7 +5551,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: min_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -5651,6 +5567,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB20_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo @@ -5659,8 +5576,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5689,7 +5606,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] @@ -5728,19 +5644,18 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] @@ -5778,13 +5693,13 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5806,11 +5721,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -5841,6 +5753,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB21_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 @@ -5857,16 +5770,13 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5891,6 +5801,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 @@ -5922,12 +5833,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -5959,6 +5867,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5976,8 +5885,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -5985,22 +5895,17 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -6016,6 +5921,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6037,7 +5943,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: umax_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6054,24 +5959,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: umax_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6088,24 +5993,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6121,24 +6026,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umax_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6157,6 +6062,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 @@ -6172,7 +6078,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1032-LABEL: umax_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6190,6 +6095,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 @@ -6205,7 +6111,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1164-LABEL: umax_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6224,6 +6129,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB22_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0 @@ -6241,7 +6147,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: umax_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -6258,6 +6163,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB22_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_mov_b32_e32 v1, 0 @@ -6296,7 +6202,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -6335,19 +6240,18 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB23_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -6385,13 +6289,13 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB23_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -6413,11 +6317,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] @@ -6448,6 +6349,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB23_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 @@ -6464,16 +6366,13 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 ; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf @@ -6498,6 +6397,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB23_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 @@ -6529,12 +6429,9 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 ; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 ; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] @@ -6566,6 +6463,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB23_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6583,8 +6481,9 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf @@ -6592,22 +6491,17 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v2, v1 ; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 ; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 @@ -6623,6 +6517,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB23_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6644,7 +6539,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; ; GFX7LESS-LABEL: umin_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6661,10 +6555,12 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB24_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 @@ -6672,13 +6568,11 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: umin_i64_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6695,6 +6589,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -6704,15 +6599,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i64_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6728,6 +6622,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -6737,15 +6632,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umin_i64_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 @@ -6764,6 +6658,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: .LBB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -6771,15 +6666,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umin_i64_constant: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6797,6 +6691,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: .LBB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -6804,15 +6699,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umin_i64_constant: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6831,6 +6725,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB24_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -6839,8 +6734,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc ; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc -; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6848,7 +6743,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; ; GFX1132-LABEL: umin_i64_constant: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -6865,6 +6759,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB24_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo @@ -6873,8 +6768,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] ; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo ; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 63ca1eb59748d..7e4e430082c49 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -16,200 +16,200 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB0_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -221,159 +221,155 @@ entry: define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -381,54 +377,54 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -452,17 +448,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -474,44 +469,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -523,28 +518,29 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: @@ -565,43 +561,41 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -612,44 +606,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: s_mov_b32 s5, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -675,46 +667,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -725,8 +715,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -734,42 +725,39 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -852,207 +840,207 @@ entry: define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB4_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB4_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB4_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB4_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB4_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB4_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1064,161 +1052,157 @@ entry: define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX6-NEXT: .LBB5_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1226,56 +1210,56 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1299,17 +1283,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1321,44 +1304,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1370,28 +1353,29 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: @@ -1412,43 +1396,41 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1459,44 +1441,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: s_mov_b32 s5, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1522,46 +1502,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1572,8 +1550,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1581,42 +1560,39 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index f1fba74ef1b62..4214db631135f 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -16,207 +16,207 @@ declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: add_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB0_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB0_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB0_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB0_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB0_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB0_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -228,221 +228,217 @@ entry: define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { ; GFX6-LABEL: add_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc +; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB1_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc +; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc +; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[12:15], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB1_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB1_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -467,17 +463,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -489,45 +484,45 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -539,29 +534,30 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: @@ -582,44 +578,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -630,45 +624,43 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -694,47 +686,45 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -745,8 +735,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -754,43 +745,40 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -975,214 +963,214 @@ entry: define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { ; GFX6-LABEL: sub_i32_constant: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 ; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB5_2 ; GFX6-NEXT: ; %bb.1: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX6-NEXT: s_mul_i32 s0, s0, 5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX6-NEXT: s_mul_i32 s4, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB5_2: ; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX8-NEXT: s_mul_i32 s0, s0, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_constant: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] -; GFX9-NEXT: s_mul_i32 s0, s0, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX9-NEXT: s_mul_i32 s4, s4, 5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_constant: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W64-NEXT: s_mul_i32 s4, s4, 5 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB5_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_constant: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_mov_b32 s5, exec_lo +; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s3, s3, 5 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB5_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_constant: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: s_mul_i32 s4, s4, 5 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB5_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_constant: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: s_mov_b32 s4, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: s_mov_b32 s2, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: s_mul_i32 s3, s3, 5 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB5_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1194,225 +1182,221 @@ entry: define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { ; GFX6-LABEL: sub_i32_uniform: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[2:3], exec -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ; implicit-def: $vgpr1 -; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX6-NEXT: s_cbranch_execz .LBB6_2 ; GFX6-NEXT: ; %bb.1: -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd -; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mul_i32 s0, s8, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_mul_i32 s4, s6, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc +; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX6-NEXT: .LBB6_2: -; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s0, s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_mul_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc +; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB6_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX9-NEXT: s_mov_b64 s[4:5], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mul_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc +; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB6_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_uniform: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX10W64-NEXT: s_mov_b64 s[6:7], exec +; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W64-NEXT: s_mov_b64 s[4:5], exec ; GFX10W64-NEXT: ; implicit-def: $vgpr1 -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: -; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 -; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 -; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc +; GFX10W64-NEXT: s_mul_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_uniform: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 -; GFX10W32-NEXT: s_mov_b32 s6, exec_lo +; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44 +; GFX10W32-NEXT: s_mov_b32 s4, exec_lo ; GFX10W32-NEXT: ; implicit-def: $vgpr1 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 -; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX10W32-NEXT: s_mul_i32 s4, s2, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_uniform: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 -; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W64-NEXT: s_mov_b64 s[4:5], exec -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 -; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_mul_i32 s4, s6, s4 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[12:15], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB6_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_uniform: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 -; GFX11W32-NEXT: s_mov_b32 s6, exec_lo -; GFX11W32-NEXT: s_mov_b32 s5, exec_lo -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_mul_i32 s4, s2, s4 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1437,17 +1421,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1459,45 +1442,45 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: flat_store_dword v[3:4], v0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1509,29 +1492,30 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: @@ -1552,44 +1536,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 ; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 -; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1600,45 +1582,43 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 ; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 -; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1664,47 +1644,45 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 +; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] ; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1715,8 +1693,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -1724,43 +1703,40 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 ; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 ; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll index d0eecc6973e77..2f98eed6da872 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll @@ -6,7 +6,6 @@ ; instruction, rather than 8 in previous generations. ; GCN-LABEL: {{^}}long_forward_branch_gfx10only: -; GFX9: s_load_dwordx2 ; GFX9: s_cmp_eq_u32 ; GFX9-NEXT: s_cbranch_scc1 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 6af32a90d8527..e9b6c7f8d5fb4 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -244,9 +244,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; because the implicit immediate already uses the constant bus. ; On GFX10+ we can use two scalar operands. ; GCN-LABEL: {{^}}madak_constant_bus_violation: -; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} - ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] +; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} ; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000 ; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 ; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index b180df0782823..9df93bc970a96 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -65,8 +65,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s2 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index c2454772ebfaf..4529dc5f1d213 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -970,31 +970,30 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 16 -; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 @@ -1003,28 +1002,28 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 ; GCN-IR-NEXT: s_min_u32 s18, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s10, s14, s18 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[20:21], s[16:17], s[20:21] -; GCN-IR-NEXT: s_and_b64 s[16:17], s[20:21], exec -; GCN-IR-NEXT: s_cselect_b32 s17, 0, s13 -; GCN-IR-NEXT: s_cselect_b32 s16, 0, s12 +; GCN-IR-NEXT: s_sub_u32 s16, s14, s18 +; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[10:11], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 ; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_add_u32 s20, s16, 1 +; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 +; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 ; GCN-IR-NEXT: s_add_u32 s19, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] @@ -1055,19 +1054,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[16:17], s[0:1] +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 -; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4 ; GCN-IR-NEXT: s_waitcnt expcnt(0) ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index bf9041c15969b..fce79701b8b0e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,26 +11,27 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s2, s11, s0 +; SI-NEXT: s_add_i32 s7, s7, s2 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if -; SI-NEXT: s_sub_i32 s2, s9, s10 +; SI-NEXT: s_sub_i32 s7, s5, s6 ; SI-NEXT: .LBB0_3: ; %endif -; SI-NEXT: s_add_i32 s0, s2, s8 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_add_i32 s4, s7, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB0_4: -; SI-NEXT: ; implicit-def: $sgpr2 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB0_2 entry: @@ -55,31 +56,32 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s6, s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s4, s[0:1], 0x13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB1_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s2, s[0:1], 0x2e ; SI-NEXT: s_load_dword s3, s[0:1], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s7, s2, s3 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: .LBB1_2: ; %if ; SI-NEXT: s_load_dword s2, s[0:1], 0x1c -; SI-NEXT: s_load_dword s0, s[0:1], 0x25 +; SI-NEXT: s_load_dword s3, s[0:1], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s7, s2, s0 +; SI-NEXT: s_add_i32 s5, s2, s3 ; SI-NEXT: .LBB1_3: ; %endif -; SI-NEXT: s_add_i32 s0, s7, s6 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_add_i32 s4, s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB1_4: -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: s_branch .LBB1_2 entry: @@ -106,30 +108,32 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s8, s2, s3 +; SI-NEXT: s_add_i32 s8, s6, s7 ; SI-NEXT: .LBB2_2: ; %Flow -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[2:3], s[6:7] +; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_xor_b64 exec, exec, s[2:3] +; SI-NEXT: s_cbranch_execz .LBB2_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_add_i32 s0, s0, s1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ; %bb.4: ; %endif +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_i32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB2_4: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 6de25193fc2a5..9af9894110c07 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -29,6 +29,9 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n) ; UNIFY-NEXT: call void @llvm.trap() ; UNIFY-NEXT: br label %UnifiedUnreachableBlock ; UNIFY-LABEL: if.end6.sink.split: +; UNIFY-NEXT: %x.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %kernel.kernarg.segment, i64 8 +; UNIFY-NEXT: %x.kernarg.offset.cast = bitcast i8 addrspace(4)* %x.kernarg.offset to i32 addrspace(1)* addrspace(4)* +; UNIFY-NEXT: %x.load = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %x.kernarg.offset.cast, align 8, !invariant.load !0 ; UNIFY-NEXT: %idxprom = sext i32 %tid to i64 ; UNIFY-NEXT: %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x.load, i64 %idxprom ; UNIFY-NEXT: store i32 %a.load, i32 addrspace(1)* %x1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index f3ee132041722..8fdf6d1683ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1157,38 +1157,37 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 -; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 24 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[0:1], 16 -; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s12, s7, 31 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16 -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 16 -; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 -; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 -; GCN-IR-NEXT: s_sub_u32 s6, s6, s12 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s12 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16 +; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 +; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 ; GCN-IR-NEXT: s_min_u32 s12, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5 ; GCN-IR-NEXT: s_min_u32 s16, s8, s9 ; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 @@ -1196,8 +1195,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] ; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] @@ -1208,10 +1207,10 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 ; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s18 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] @@ -1246,24 +1245,26 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-IR-NEXT: s_mul_i32 s8, s6, s11 -; GCN-IR-NEXT: s_mul_i32 s7, s7, s10 -; GCN-IR-NEXT: s_mul_i32 s6, s6, s10 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s8, v0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s7, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s6, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s2, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s3, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index f652331362659..62eea86ed8f75 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,28 +7,28 @@ target triple="amdgcn--" define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b32 s6, -1 +; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.1: ; %ift -; CHECK-NEXT: s_mov_b32 s0, s1 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: s_mov_b32 s4, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: ; %bb.2: ; %ife -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s4, s2 -; CHECK-NEXT: s_mov_b32 s5, s3 -; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; CHECK-NEXT: s_endpgm ; FIXME: The change related to the fact that diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 9c99ef27a0748..3c3c77b1a5331 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -783,90 +783,90 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s3, s7, 0xffff -; GCN-IR-NEXT: s_and_b32 s2, s6, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff +; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 -; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[0:1], 24 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 +; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[6:7] -; GCN-IR-NEXT: s_flbit_i32_b32 s2, s0 -; GCN-IR-NEXT: s_add_i32 s2, s2, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s1 -; GCN-IR-NEXT: s_min_u32 s10, s2, s3 -; GCN-IR-NEXT: s_flbit_i32_b32 s2, s8 -; GCN-IR-NEXT: s_add_i32 s2, s2, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s9 -; GCN-IR-NEXT: s_min_u32 s14, s2, s3 -; GCN-IR-NEXT: s_sub_u32 s6, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[16:17] -; GCN-IR-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s13, 0, s9 -; GCN-IR-NEXT: s_cselect_b32 s12, 0, s8 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 +; GCN-IR-NEXT: s_add_i32 s4, s4, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 +; GCN-IR-NEXT: s_min_u32 s10, s4, s5 +; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8 +; GCN-IR-NEXT: s_add_i32 s4, s4, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9 +; GCN-IR-NEXT: s_min_u32 s14, s4, s5 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[6:7], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9 +; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[16:17], 0 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s12 -; GCN-IR-NEXT: s_add_u32 s15, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s16, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s2, s14 -; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s16 +; GCN-IR-NEXT: s_add_u32 s15, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s3, -1 +; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] +; GCN-IR-NEXT: s_add_u32 s8, s4, s14 +; GCN-IR-NEXT: s_addc_u32 s9, s5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s15, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s16, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s4, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s16, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1] +; GCN-IR-NEXT: s_and_b32 s4, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[0:1] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB7_5: ; %udiv-end -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: s_waitcnt expcnt(0) -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 diff --git a/llvm/test/Transforms/Sink/invariant-load.ll b/llvm/test/Transforms/Sink/invariant-load.ll new file mode 100644 index 0000000000000..0e5ab2fbf7fef --- /dev/null +++ b/llvm/test/Transforms/Sink/invariant-load.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -sink -S < %s | FileCheck %s + +; Loads marked invariant can be sunk across critical edges + +define <4 x float> @invariant_load(<4 x float> *%in, i32 %s) { +; CHECK-LABEL: @invariant_load( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = load <4 x float>, <4 x float>* [[IN:%.*]], align 16, !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = load <4 x float>, <4 x float> *%in, !invariant.load !0 + %c = icmp eq i32 %s, 0 + br i1 %c, label %block, label %end +block: + %z = add i32 %s, 1 + br label %end +end: + ret <4 x float> %v +} + +!0 = !{}