diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index fed37788802b9..82789bc4968c5 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -722,7 +722,8 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; } - if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) { + if (New->getReg().isVirtual() && + !MRI->constrainRegClass(New->getReg(), ConstrainRC)) { LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI) << TRI->getRegClassName(ConstrainRC) << '\n'); return false; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index e4b3528b432bb..0189e7b90ca94 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -306,7 +306,8 @@ class PrologEpilogSGPRSpillBuilder { buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, FI, FrameReg, DwordOff); - MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); + assert(SubReg.isPhysical()); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 56435a50c87ad..cda8069936af2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2112,8 +2112,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::SI_RESTORE_S32_FROM_VGPR: MI.setDesc(get(AMDGPU::V_READLANE_B32)); - MI.getMF()->getRegInfo().constrainRegClass(MI.getOperand(0).getReg(), - &AMDGPU::SReg_32_XM0RegClass); break; case AMDGPU::AV_MOV_B32_IMM_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); @@ -8117,21 +8115,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // hope for the best. if (Inst.isCopy() && DstReg.isPhysical() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { - // TODO: Only works for 32 bit registers. - if (MRI.constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass)) { - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), DstReg) - .add(Inst.getOperand(1)); - } else { - Register NewDst = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), - get(AMDGPU::V_READFIRSTLANE_B32), NewDst) - .add(Inst.getOperand(1)); - BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), - DstReg) - .addReg(NewDst); - } + Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), NewDst) + .add(Inst.getOperand(1)); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), + DstReg) + .addReg(NewDst); + Inst.eraseFromParent(); return; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 205237fefe785..3c2dd4252c583 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2222,8 +2222,6 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, // Don't need to write VGPR out. } - MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); - // Restore clobbered registers in the specified restore block. MI = RestoreMBB.end(); SB.setMI(&RestoreMBB, MI); @@ -2238,7 +2236,8 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, SB.NumSubRegs == 1 ? SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); - MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass); + + assert(SubReg.isPhysical()); bool LastSubReg = (i + 1 == e); auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) @@ -3059,8 +3058,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (IsSALU && LiveSCC) { Register NewDest; if (IsCopy) { - MF->getRegInfo().constrainRegClass(ResultReg, - &AMDGPU::SReg_32_XM0RegClass); + assert(ResultReg.isPhysical()); NewDest = ResultReg; } else { NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, @@ -3190,8 +3188,6 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Register NewDest; if (IsCopy) { - MF->getRegInfo().constrainRegClass(ResultReg, - &AMDGPU::SReg_32_XM0RegClass); NewDest = ResultReg; } else { NewDest = RS->scavengeRegisterBackwards( diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index aac499f2fc602..b486fabb19497 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa" define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0 -; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0 +; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1 ; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo @@ -56,13 +55,11 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm @@ -91,10 +88,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { ; GFX1250-LABEL: use_flat_to_private_addrspacecast: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS @@ -110,9 +106,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm @@ -122,9 +117,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index ef52694910da3..54871a622189b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -538,58 +538,61 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_8 +; GFX1250-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB34_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB34_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_3: ; %Flow +; GFX1250-NEXT: .LBB34_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB34_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB34_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-NEXT: .LBB34_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB34_2 +; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val monotonic ret double %result @@ -600,58 +603,61 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_8 +; GFX1250-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB35_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB35_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_3: ; %Flow +; GFX1250-NEXT: .LBB35_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB35_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB35_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-NEXT: .LBB35_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB35_2 +; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -686,40 +692,42 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_4 +; GFX1250-NEXT: .LBB38_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB38_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB38_2 +; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val monotonic ret double %result @@ -730,40 +738,42 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_4 +; GFX1250-NEXT: .LBB39_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB39_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB39_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB39_2 +; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -798,40 +808,42 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_4 +; GFX1250-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB42_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB42_2 +; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val monotonic ret double %result @@ -842,40 +854,42 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_4 +; GFX1250-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB43_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB43_2 +; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -982,13 +996,11 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1000,10 +1012,9 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1025,13 +1036,11 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1043,10 +1052,9 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1068,13 +1076,11 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1086,10 +1092,9 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1111,13 +1116,11 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1129,10 +1132,9 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1154,13 +1156,11 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1172,10 +1172,9 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1197,13 +1196,11 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1215,10 +1212,9 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1240,13 +1236,11 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1258,10 +1252,9 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1283,13 +1276,11 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1301,10 +1292,9 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1e7855ccb3642..eefc7811d42b6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -541,11 +541,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -570,9 +569,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -586,14 +584,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -618,10 +615,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -727,13 +723,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -754,9 +749,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -770,8 +764,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -780,7 +773,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -805,10 +798,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -917,11 +909,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -943,9 +934,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -953,15 +943,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -982,10 +971,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1069,11 +1057,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 @@ -1094,9 +1080,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -1104,8 +1089,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1115,7 +1099,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -1136,10 +1120,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1400,11 +1383,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1429,9 +1411,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1445,14 +1426,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1477,10 +1457,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1590,13 +1569,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1617,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1633,8 +1610,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -1643,7 +1619,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1668,10 +1644,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1784,11 +1759,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1810,9 +1784,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1823,15 +1796,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -1852,10 +1824,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1950,11 +1921,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 @@ -1975,9 +1944,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1988,8 +1956,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1999,7 +1966,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -2020,10 +1987,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2295,11 +2261,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2324,9 +2289,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2340,14 +2304,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2372,10 +2335,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2487,13 +2449,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2514,9 +2475,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2530,8 +2490,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -2540,7 +2499,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2565,10 +2524,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2683,11 +2641,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2709,9 +2666,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2722,15 +2678,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -2751,10 +2706,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2851,11 +2805,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 @@ -2876,9 +2828,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2889,8 +2840,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2900,7 +2850,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -2921,10 +2871,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3198,11 +3147,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3227,9 +3175,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3244,14 +3191,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3276,10 +3222,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3390,13 +3335,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3417,9 +3361,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3434,8 +3377,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -3444,7 +3386,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3469,10 +3411,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3586,11 +3527,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3612,9 +3552,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3626,15 +3565,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -3655,10 +3593,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3754,11 +3691,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 @@ -3779,9 +3714,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3793,8 +3727,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3804,7 +3737,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -3825,10 +3758,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4101,11 +4033,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4130,9 +4061,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4147,14 +4077,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4179,10 +4108,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4293,13 +4221,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4320,9 +4247,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4337,8 +4263,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -4347,7 +4272,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4372,10 +4297,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4489,11 +4413,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4515,9 +4438,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4529,15 +4451,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -4558,10 +4479,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4657,11 +4577,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 @@ -4682,9 +4600,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4696,8 +4613,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4707,7 +4623,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -4728,10 +4644,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5004,11 +4919,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5033,9 +4947,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5050,14 +4963,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5082,10 +4994,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5196,13 +5107,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5223,9 +5133,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5240,8 +5149,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -5250,7 +5158,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5275,10 +5183,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5392,11 +5299,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5418,9 +5324,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5432,15 +5337,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -5461,10 +5365,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5560,11 +5463,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 @@ -5585,9 +5486,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5599,8 +5499,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5610,7 +5509,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -5631,10 +5530,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5877,11 +5775,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5902,9 +5799,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5918,14 +5814,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5946,10 +5841,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6061,13 +5955,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6084,9 +5977,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6100,8 +5992,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6110,7 +6001,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6131,10 +6022,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6249,11 +6139,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -6272,9 +6161,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6285,15 +6173,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -6311,10 +6198,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6409,11 +6295,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 @@ -6431,9 +6315,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6444,8 +6327,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6455,7 +6337,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -6473,10 +6355,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6718,11 +6599,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6743,9 +6623,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6759,14 +6638,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6787,10 +6665,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6902,13 +6779,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6925,9 +6801,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6941,8 +6816,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6951,7 +6825,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6972,10 +6846,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7090,11 +6963,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7113,9 +6985,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7126,15 +6997,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -7152,10 +7022,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7250,11 +7119,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 @@ -7272,9 +7139,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7285,8 +7151,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7296,7 +7161,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -7314,10 +7179,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7559,11 +7423,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7584,9 +7447,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7600,14 +7462,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7628,10 +7489,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7743,13 +7603,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -7766,9 +7625,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7782,8 +7640,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -7792,7 +7649,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7813,10 +7670,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7931,11 +7787,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7954,9 +7809,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7967,15 +7821,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -7993,10 +7846,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8091,11 +7943,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 @@ -8113,9 +7963,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8126,8 +7975,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8137,7 +7985,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -8155,10 +8003,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8400,11 +8247,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -8425,9 +8271,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8441,14 +8286,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8469,10 +8313,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8584,13 +8427,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -8607,9 +8449,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8623,8 +8464,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -8633,7 +8473,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8654,10 +8494,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8772,11 +8611,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -8795,9 +8633,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8808,15 +8645,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -8834,10 +8670,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8932,11 +8767,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 @@ -8954,9 +8787,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8967,8 +8799,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8978,7 +8809,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -8996,10 +8827,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9281,12 +9111,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9311,9 +9140,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9328,15 +9156,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9361,10 +9188,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9485,13 +9311,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -9512,9 +9337,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9529,10 +9353,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9540,7 +9363,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9565,10 +9388,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9690,13 +9512,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -9718,9 +9539,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9732,15 +9552,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -9761,10 +9581,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9869,11 +9688,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 @@ -9894,9 +9711,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9908,18 +9724,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -9940,10 +9756,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -10188,11 +10003,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -10214,10 +10028,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10233,14 +10046,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10262,18 +10074,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10386,13 +10197,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -10410,10 +10220,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10429,8 +10238,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -10439,7 +10247,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10461,18 +10269,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10588,11 +10395,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -10610,9 +10416,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10625,15 +10430,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -10650,17 +10454,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10754,11 +10557,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 @@ -10775,9 +10576,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10790,8 +10590,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10801,7 +10600,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -10818,17 +10617,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -11064,11 +10862,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -11090,10 +10887,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11112,14 +10908,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11141,11 +10936,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11268,13 +11062,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -11292,10 +11085,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11314,8 +11106,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -11324,7 +11115,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11346,11 +11137,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11476,11 +11266,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -11498,9 +11287,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11516,15 +11304,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -11541,10 +11328,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11648,11 +11434,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 @@ -11669,9 +11453,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11687,8 +11470,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11698,7 +11480,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -11715,10 +11497,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11834,12 +11615,10 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global @@ -11855,9 +11634,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB110_4: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -11891,10 +11668,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 @@ -11910,9 +11686,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12060,12 +11834,10 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 @@ -12079,9 +11851,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 ; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12112,9 +11882,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 @@ -12130,9 +11899,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12261,9 +12028,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12279,10 +12045,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12301,10 +12066,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 @@ -12320,10 +12084,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12413,9 +12176,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12434,11 +12196,10 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12454,9 +12215,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 @@ -12473,10 +12233,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12562,9 +12321,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12580,10 +12338,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12602,10 +12359,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 @@ -12621,10 +12377,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12714,9 +12469,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12735,11 +12489,10 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12755,9 +12508,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 @@ -12774,10 +12526,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 56215ca20651a..67d0410434a99 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -59,21 +59,20 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; -; GFX1250-SDAG-LABEL: is_private_vgpr: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v1 -; GFX1250-SDAG-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 -; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off -; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-LABEL: is_private_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm ; ; CI-GISEL-LABEL: is_private_vgpr: ; CI-GISEL: ; %bb.0: @@ -122,22 +121,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: is_private_vgpr: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 -; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off -; GFX1250-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile ptr, ptr addrspace(1) %gep @@ -206,9 +189,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX1250-SDAG-LABEL: is_private_sgpr: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s0, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 @@ -285,9 +267,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s0, 0x4000000 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 @@ -311,5 +292,4 @@ bb1: ; CI: {{.*}} ; GFX10-GISEL: {{.*}} ; GFX11-GISEL: {{.*}} -; GFX1250: {{.*}} ; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 335d58c43c936..a18847b56a330 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -324,11 +324,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] -; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -350,10 +348,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; SDAG-NEXT: s_cbranch_execz .LBB21_2 ; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -367,12 +364,12 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi +; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GISEL-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, v5, v0 +; GISEL-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -394,11 +391,10 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0 +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0