diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 736744569a3bd..38331b614bceb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -710,7 +710,8 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { - const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg()); + const TargetRegisterClass *NewRC = + TRI->getRegClassForReg(*MRI, New->getReg()); const TargetRegisterClass *ConstrainRC = TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg()); if (!ConstrainRC) @@ -727,8 +728,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // 16-bit SGPRs instead of 32-bit ones. if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) Old.setSubReg(AMDGPU::NoSubRegister); - Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); - Old.setIsUndef(New->isUndef()); + if (New->getReg().isPhysical()) { + Old.substPhysReg(New->getReg(), *TRI); + } else { + Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); + Old.setIsUndef(New->isUndef()); + } return true; } @@ -1997,7 +2002,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( if (!FoldingImm && !OpToFold.isReg()) return false; - if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) + // Fold virtual registers and constant physical registers. + if (OpToFold.isReg() && OpToFold.getReg().isPhysical() && + !TRI->isConstantPhysReg(OpToFold.getReg())) return false; // Prevent folding operands backwards in the function. For example, diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index 153898560fc31..aac499f2fc602 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -27,20 +27,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS @@ -69,14 +69,13 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 082050877e0bb..605026614c614 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -586,14 +586,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -618,10 +618,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -770,9 +770,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -780,7 +780,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -805,10 +805,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -953,15 +953,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -982,10 +982,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1104,18 +1104,18 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -1136,10 +1136,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1445,14 +1445,14 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1477,10 +1477,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1633,9 +1633,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1643,7 +1643,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1668,10 +1668,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1823,15 +1823,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -1852,10 +1852,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1988,18 +1988,18 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -2020,10 +2020,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2340,14 +2340,14 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2372,10 +2372,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2530,9 +2530,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2540,7 +2540,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2565,10 +2565,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2722,15 +2722,15 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -2751,10 +2751,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2889,18 +2889,18 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -2921,10 +2921,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3244,14 +3244,14 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3276,10 +3276,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3434,9 +3434,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3444,7 +3444,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3469,10 +3469,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3626,15 +3626,15 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -3655,10 +3655,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3793,18 +3793,18 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -3825,10 +3825,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4147,14 +4147,14 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4179,10 +4179,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4337,9 +4337,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4347,7 +4347,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4372,10 +4372,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4529,15 +4529,15 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -4558,10 +4558,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4696,18 +4696,18 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -4728,10 +4728,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5050,14 +5050,14 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5082,10 +5082,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5240,9 +5240,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -5250,7 +5250,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5275,10 +5275,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5432,15 +5432,15 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -5461,10 +5461,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5599,18 +5599,18 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -5631,10 +5631,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5918,14 +5918,14 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5946,10 +5946,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6100,9 +6100,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -6110,7 +6110,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6131,10 +6131,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6285,15 +6285,15 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -6311,10 +6311,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6444,18 +6444,18 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -6473,10 +6473,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6759,14 +6759,14 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6787,10 +6787,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6941,9 +6941,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -6951,7 +6951,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6972,10 +6972,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7126,15 +7126,15 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -7152,10 +7152,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7285,18 +7285,18 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -7314,10 +7314,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7600,14 +7600,14 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7628,10 +7628,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7782,9 +7782,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -7792,7 +7792,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7813,10 +7813,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7967,15 +7967,15 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -7993,10 +7993,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8126,18 +8126,18 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -8155,10 +8155,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8441,14 +8441,14 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8469,10 +8469,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8623,9 +8623,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -8633,7 +8633,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8654,10 +8654,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8808,15 +8808,15 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -8834,10 +8834,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8967,18 +8967,18 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -8996,10 +8996,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9328,15 +9328,15 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9361,10 +9361,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9529,10 +9529,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9540,7 +9540,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9565,10 +9565,10 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9732,16 +9732,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -9762,10 +9761,10 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9909,19 +9908,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -9942,10 +9940,10 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -10235,14 +10233,14 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10264,18 +10262,18 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10431,9 +10429,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -10441,7 +10439,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10463,18 +10461,18 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10627,15 +10625,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -10652,17 +10650,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10792,18 +10790,18 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -10820,17 +10818,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -11114,14 +11112,14 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11143,11 +11141,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11316,9 +11314,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -11326,7 +11324,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11348,11 +11346,11 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11518,15 +11516,15 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -11543,10 +11541,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11689,18 +11687,18 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -11717,10 +11715,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 0d6f84edcb487..56215ca20651a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -2,10 +2,12 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; SI-LABEL: is_private_vgpr: @@ -57,6 +59,22 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; +; GFX1250-SDAG-LABEL: is_private_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v1 +; GFX1250-SDAG-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-SDAG-NEXT: s_endpgm +; ; CI-GISEL-LABEL: is_private_vgpr: ; CI-GISEL: ; %bb.0: ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -104,6 +122,22 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: is_private_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v1, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile ptr, ptr addrspace(1) %gep @@ -169,6 +203,24 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1 ; GFX9-SDAG-NEXT: s_endpgm ; +; GFX1250-SDAG-LABEL: is_private_sgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s0, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %bb0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: .LBB1_2: ; %bb1 +; GFX1250-SDAG-NEXT: s_endpgm +; ; CI-GISEL-LABEL: is_private_sgpr: ; CI-GISEL: ; %bb.0: ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -228,6 +280,22 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 ; GFX11-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: is_private_sgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s0, 0x4000000 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: .LBB1_2: ; %bb1 +; GFX1250-GISEL-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 @@ -243,4 +311,5 @@ bb1: ; CI: {{.*}} ; GFX10-GISEL: {{.*}} ; GFX11-GISEL: {{.*}} +; GFX1250: {{.*}} ; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index e0bdd77bd18e2..307ff046d48c2 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -380,8 +380,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v6, src_flat_scratch_base_hi ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 @@ -390,7 +389,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 ; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v5, v6 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -412,11 +411,11 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0