From 65e816400db6e765bcd8e860ee7a6304938229c7 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 12:50:36 -0500 Subject: [PATCH 1/7] revised initial commit --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 ++++++++++++++++++++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e5f0e3e631988..3e6bd3b164a14 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1313,6 +1313,27 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, return Reg; } +MachineInstr * +SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + int64_t SubRegValues[2]; + bool SubRegIsConst[2]; + MachineInstr *RealDefs[2]; + for (unsigned I : {2, 4}) { + unsigned ArrayIdx = MI.getOperand(I).getImm() == AMDGPU::sub0 ? 0 : 1; + Register Subreg = MI.getOperand(I - 1).getReg(); + RealDefs[ArrayIdx] = MRI.getUniqueVRegDef(Subreg); + SubRegIsConst[ArrayIdx] = getConstValDefinedInReg( + *RealDefs[ArrayIdx], Subreg, SubRegValues[ArrayIdx]); + } + + for (unsigned I : {0, 1}) + if (SubRegIsConst[I] && !SubRegValues[I]) + return RealDefs[(I + 1) % 2]; + + return nullptr; +} + bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { @@ -10698,6 +10719,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!Def) return false; + if (MachineInstr *RegSequenceDef = pierceThroughRegSequence(*Def)) + Def = RegSequenceDef; + // For S_OP that set SCC = DST!=0, do the transformation // // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c048b85b1e99a..1d5353bd225b2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -711,6 +711,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } } + MachineInstr *pierceThroughRegSequence(const MachineInstr &MI) const; + static bool setsSCCifResultIsNonZero(const MachineInstr &MI) { switch (MI.getOpcode()) { case AMDGPU::S_ABSDIFF_I32: From 40a6671c50371e2a6b797cc42adc7e8096e53892 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 13:04:15 -0500 Subject: [PATCH 2/7] Fix --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 3e6bd3b164a14..28967ebfb8a57 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1315,6 +1315,9 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, MachineInstr * SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const { + if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) + return nullptr; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int64_t SubRegValues[2]; bool SubRegIsConst[2]; From 960bdd313b29a22fef4c87b5395ea9ba677396c7 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 13:06:06 -0500 Subject: [PATCH 3/7] Add testcase passing on main --- .../AMDGPU/redundant-cmp-reg-sequence.ll | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll diff --git a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll new file mode 100644 index 0000000000000..00e479fe2eccc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s +define amdgpu_ps i64 @ordertest(i64 inreg %val0) { +; CHECK-LABEL: ordertest: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshr_b32 s0, s1, 2 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; CHECK-NEXT: v_lshrrev_b64 v[0:1], v2, s[0:1] +; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = lshr i64 %val0, 34 + %result = and i64 %shl, 4294967295 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i64 + %param0 = lshr i64 %shl, %zext + %param = and i64 %param0, 4294967295 + %xory = xor i64 %zext, %param + ret i64 %xory +} From 51956c903cb71c3d1488836f66189f59c9999e3a Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 13:06:34 -0500 Subject: [PATCH 4/7] Add testcase passing on branch --- llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll index 00e479fe2eccc..750a9027d47a4 100644 --- a/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll +++ b/llvm/test/CodeGen/AMDGPU/redundant-cmp-reg-sequence.ll @@ -4,9 +4,8 @@ define amdgpu_ps i64 @ordertest(i64 inreg %val0) { ; CHECK-LABEL: ordertest: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshr_b32 s0, s1, 2 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; CHECK-NEXT: v_lshrrev_b64 v[0:1], v2, s[0:1] ; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 From de0102e8ec9b3a41e5e89a6d94881fe3b8df7f76 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 13:29:14 -0500 Subject: [PATCH 5/7] Update testcases --- .../test/CodeGen/AMDGPU/carryout-selection.ll | 346 +++++++++--------- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 4 - llvm/test/CodeGen/AMDGPU/srem.ll | 98 ++--- llvm/test/CodeGen/AMDGPU/wave32.ll | 115 +++--- 4 files changed, 263 insertions(+), 300 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 8d05317162e9c..e7de08537883f 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -2116,8 +2116,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; VI-NEXT: s_mov_b32 s6, 0 -; VI-NEXT: s_cmp_lg_u64 s[6:7], 0 ; VI-NEXT: s_cbranch_scc0 .LBB16_3 ; VI-NEXT: ; %bb.1: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2268,8 +2266,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2418,10 +2414,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_clause 0x1 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_mov_b32 s8, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] -; GFX1010-NEXT: s_mov_b32 s4, 0 -; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2436,71 +2431,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1010-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1010-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s5 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1010-NEXT: s_mul_i32 s12, s10, s8 +; GFX1010-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1010-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s4 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s5 +; GFX1010-NEXT: s_mul_i32 s12, s10, s5 ; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s9, s8 +; GFX1010-NEXT: s_mul_i32 s14, s9, s5 ; GFX1010-NEXT: s_add_i32 s11, s11, s12 -; GFX1010-NEXT: s_mul_hi_u32 s13, s8, s14 -; GFX1010-NEXT: s_mul_i32 s16, s8, s11 -; GFX1010-NEXT: s_mul_hi_u32 s15, s5, s14 -; GFX1010-NEXT: s_mul_i32 s12, s5, s14 -; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s14 +; GFX1010-NEXT: s_mul_i32 s16, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s15, s4, s14 +; GFX1010-NEXT: s_mul_i32 s12, s4, s14 +; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s11 ; GFX1010-NEXT: s_add_u32 s13, s13, s16 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s17, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s17, s4, s11 ; GFX1010-NEXT: s_add_u32 s12, s13, s12 -; GFX1010-NEXT: s_mul_i32 s11, s5, s11 +; GFX1010-NEXT: s_mul_i32 s11, s4, s11 ; GFX1010-NEXT: s_addc_u32 s12, s14, s15 ; GFX1010-NEXT: s_addc_u32 s13, s17, 0 ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 -; GFX1010-NEXT: s_add_u32 s8, s8, s11 -; GFX1010-NEXT: s_addc_u32 s5, s5, s12 -; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1010-NEXT: s_mul_i32 s12, s9, s8 -; GFX1010-NEXT: s_mul_i32 s9, s9, s5 -; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_add_u32 s5, s5, s11 +; GFX1010-NEXT: s_addc_u32 s4, s4, s12 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s5 +; GFX1010-NEXT: s_mul_i32 s12, s9, s5 +; GFX1010-NEXT: s_mul_i32 s9, s9, s4 +; GFX1010-NEXT: s_mul_i32 s10, s10, s5 ; GFX1010-NEXT: s_add_i32 s9, s11, s9 -; GFX1010-NEXT: s_mul_i32 s11, s5, s12 +; GFX1010-NEXT: s_mul_i32 s11, s4, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 -; GFX1010-NEXT: s_mul_i32 s15, s8, s9 -; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1010-NEXT: s_mul_hi_u32 s10, s5, s12 +; GFX1010-NEXT: s_mul_i32 s15, s5, s9 +; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s9 ; GFX1010-NEXT: s_add_u32 s10, s10, s15 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1010-NEXT: s_mul_hi_u32 s13, s4, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1010-NEXT: s_mul_hi_u32 s12, s4, s9 ; GFX1010-NEXT: s_add_u32 s10, s10, s11 -; GFX1010-NEXT: s_mul_i32 s9, s5, s9 +; GFX1010-NEXT: s_mul_i32 s9, s4, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 ; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 -; GFX1010-NEXT: s_add_u32 s8, s8, s9 -; GFX1010-NEXT: s_addc_u32 s5, s5, s10 -; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1010-NEXT: s_mul_i32 s12, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 -; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_add_u32 s5, s5, s9 +; GFX1010-NEXT: s_addc_u32 s4, s4, s10 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s5 +; GFX1010-NEXT: s_mul_i32 s12, s2, s4 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s4 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s5 +; GFX1010-NEXT: s_mul_i32 s5, s3, s5 ; GFX1010-NEXT: s_add_u32 s9, s9, s12 ; GFX1010-NEXT: s_addc_u32 s11, 0, s11 -; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1010-NEXT: s_add_u32 s8, s9, s8 -; GFX1010-NEXT: s_mul_i32 s5, s3, s5 -; GFX1010-NEXT: s_addc_u32 s8, s11, s10 +; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s4 +; GFX1010-NEXT: s_add_u32 s5, s9, s5 +; GFX1010-NEXT: s_mul_i32 s4, s3, s4 +; GFX1010-NEXT: s_addc_u32 s5, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 -; GFX1010-NEXT: s_add_u32 s5, s8, s5 -; GFX1010-NEXT: s_addc_u32 s8, 0, s9 -; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s5 -; GFX1010-NEXT: s_mul_i32 s10, s6, s8 -; GFX1010-NEXT: s_mul_i32 s11, s7, s5 -; GFX1010-NEXT: s_add_i32 s9, s9, s10 +; GFX1010-NEXT: s_add_u32 s4, s5, s4 +; GFX1010-NEXT: s_addc_u32 s5, 0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s4 ; GFX1010-NEXT: s_mul_i32 s10, s6, s5 +; GFX1010-NEXT: s_mul_i32 s11, s7, s4 +; GFX1010-NEXT: s_add_i32 s9, s9, s10 +; GFX1010-NEXT: s_mul_i32 s10, s6, s4 ; GFX1010-NEXT: s_add_i32 s9, s9, s11 ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 @@ -2514,10 +2509,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1010-NEXT: s_cmp_eq_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1010-NEXT: s_add_u32 s13, s5, 1 -; GFX1010-NEXT: s_addc_u32 s14, s8, 0 -; GFX1010-NEXT: s_add_u32 s15, s5, 2 -; GFX1010-NEXT: s_addc_u32 s16, s8, 0 +; GFX1010-NEXT: s_add_u32 s13, s4, 1 +; GFX1010-NEXT: s_addc_u32 s14, s5, 0 +; GFX1010-NEXT: s_add_u32 s15, s4, 2 +; GFX1010-NEXT: s_addc_u32 s16, s5, 0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1010-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1010-NEXT: s_cselect_b32 s13, s16, s14 @@ -2530,14 +2525,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_eq_u32 s3, s7 ; GFX1010-NEXT: s_cselect_b32 s3, s10, s9 ; GFX1010-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1010-NEXT: s_cselect_b32 s9, s13, s8 -; GFX1010-NEXT: s_cselect_b32 s8, s11, s5 -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX1010-NEXT: s_cselect_b32 s5, s13, s5 +; GFX1010-NEXT: s_cselect_b32 s4, s11, s4 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX1010-NEXT: s_sub_i32 s4, 0, s6 -; GFX1010-NEXT: s_mov_b32 s9, 0 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2555,15 +2549,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cselect_b32 s2, s5, s2 ; GFX1010-NEXT: s_add_i32 s4, s3, 1 ; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 -; GFX1010-NEXT: s_cselect_b32 s8, s4, s3 +; GFX1010-NEXT: s_mov_b32 s5, 0 +; GFX1010-NEXT: s_cselect_b32 s4, s4, s3 ; GFX1010-NEXT: .LBB16_3: -; GFX1010-NEXT: v_mov_b32_e32 v0, s8 +; GFX1010-NEXT: v_mov_b32_e32 v0, s4 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s9 +; GFX1010-NEXT: v_mov_b32_e32 v1, s5 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; GFX1010-NEXT: .LBB16_4: -; GFX1010-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX1010-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1010-NEXT: s_branch .LBB16_2 ; ; GFX1030W32-LABEL: sudiv64: @@ -2571,10 +2566,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_clause 0x1 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX1030W32-NEXT: s_mov_b32 s8, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; GFX1030W32-NEXT: s_mov_b32 s6, 0 -; GFX1030W32-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W32-NEXT: ; %bb.1: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2589,71 +2583,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v1 -; GFX1030W32-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s10, s8 +; GFX1030W32-NEXT: v_readfirstlane_b32 s6, v1 +; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s6 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s12, s10, s7 ; GFX1030W32-NEXT: s_add_i32 s11, s13, s11 -; GFX1030W32-NEXT: s_mul_i32 s14, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s14, s9, s7 ; GFX1030W32-NEXT: s_add_i32 s11, s11, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s8, s14 -; GFX1030W32-NEXT: s_mul_i32 s16, s8, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s15, s7, s14 -; GFX1030W32-NEXT: s_mul_i32 s12, s7, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s14 +; GFX1030W32-NEXT: s_mul_i32 s16, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s15, s6, s14 +; GFX1030W32-NEXT: s_mul_i32 s12, s6, s14 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s11 ; GFX1030W32-NEXT: s_add_u32 s13, s13, s16 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s17, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s17, s6, s11 ; GFX1030W32-NEXT: s_add_u32 s12, s13, s12 -; GFX1030W32-NEXT: s_mul_i32 s11, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s11, s6, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, s14, s15 ; GFX1030W32-NEXT: s_addc_u32 s13, s17, 0 ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 -; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 -; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_add_u32 s7, s7, s11 +; GFX1030W32-NEXT: s_addc_u32 s6, s6, s12 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s12, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s6 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s7 ; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 -; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s11, s6, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 -; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s15, s7, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s9 ; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s6, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s9 ; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 -; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 +; GFX1030W32-NEXT: s_mul_i32 s9, s6, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 ; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 -; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 -; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 -; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_add_u32 s7, s7, s9 +; GFX1030W32-NEXT: s_addc_u32 s6, s6, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s6 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s6 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s7 +; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 ; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 ; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 -; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s6 +; GFX1030W32-NEXT: s_add_u32 s7, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s6, s3, s6 +; GFX1030W32-NEXT: s_addc_u32 s7, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 -; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 -; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s7 -; GFX1030W32-NEXT: s_mul_i32 s10, s4, s8 -; GFX1030W32-NEXT: s_mul_i32 s11, s5, s7 -; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W32-NEXT: s_add_u32 s6, s7, s6 +; GFX1030W32-NEXT: s_addc_u32 s7, 0, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s6 ; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7 +; GFX1030W32-NEXT: s_mul_i32 s11, s5, s6 +; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W32-NEXT: s_mul_i32 s10, s4, s6 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s11 ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 @@ -2667,10 +2661,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1030W32-NEXT: s_add_u32 s13, s7, 1 -; GFX1030W32-NEXT: s_addc_u32 s14, s8, 0 -; GFX1030W32-NEXT: s_add_u32 s15, s7, 2 -; GFX1030W32-NEXT: s_addc_u32 s16, s8, 0 +; GFX1030W32-NEXT: s_add_u32 s13, s6, 1 +; GFX1030W32-NEXT: s_addc_u32 s14, s7, 0 +; GFX1030W32-NEXT: s_add_u32 s15, s6, 2 +; GFX1030W32-NEXT: s_addc_u32 s16, s7, 0 ; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1030W32-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1030W32-NEXT: s_cselect_b32 s13, s16, s14 @@ -2683,14 +2677,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W32-NEXT: s_cselect_b32 s3, s10, s9 ; GFX1030W32-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1030W32-NEXT: s_cselect_b32 s9, s13, s8 -; GFX1030W32-NEXT: s_cselect_b32 s8, s11, s7 -; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 +; GFX1030W32-NEXT: s_cselect_b32 s7, s13, s7 +; GFX1030W32-NEXT: s_cselect_b32 s6, s11, s6 +; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1030W32-NEXT: .LBB16_2: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W32-NEXT: s_sub_i32 s5, 0, s4 -; GFX1030W32-NEXT: s_mov_b32 s9, 0 +; GFX1030W32-NEXT: s_mov_b32 s7, 0 ; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2708,15 +2702,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cselect_b32 s2, s6, s2 ; GFX1030W32-NEXT: s_add_i32 s5, s3, 1 ; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3 +; GFX1030W32-NEXT: s_cselect_b32 s6, s5, s3 ; GFX1030W32-NEXT: .LBB16_3: -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm ; GFX1030W32-NEXT: .LBB16_4: -; GFX1030W32-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX1030W32-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1030W32-NEXT: s_branch .LBB16_2 ; ; GFX1030W64-LABEL: sudiv64: @@ -2726,8 +2720,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; GFX1030W64-NEXT: s_mov_b32 s6, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2876,11 +2868,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2900,71 +2890,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s7, v1 -; GFX11-NEXT: v_readfirstlane_b32 s8, v0 -; GFX11-NEXT: s_mul_i32 s11, s9, s7 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX11-NEXT: s_mul_i32 s12, s10, s8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-NEXT: s_mul_i32 s11, s9, s6 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s7 +; GFX11-NEXT: s_mul_i32 s12, s10, s7 ; GFX11-NEXT: s_add_i32 s11, s13, s11 -; GFX11-NEXT: s_mul_i32 s14, s9, s8 +; GFX11-NEXT: s_mul_i32 s14, s9, s7 ; GFX11-NEXT: s_add_i32 s11, s11, s12 -; GFX11-NEXT: s_mul_hi_u32 s13, s8, s14 -; GFX11-NEXT: s_mul_i32 s16, s8, s11 -; GFX11-NEXT: s_mul_hi_u32 s15, s7, s14 -; GFX11-NEXT: s_mul_i32 s12, s7, s14 -; GFX11-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s14 +; GFX11-NEXT: s_mul_i32 s16, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s15, s6, s14 +; GFX11-NEXT: s_mul_i32 s12, s6, s14 +; GFX11-NEXT: s_mul_hi_u32 s14, s7, s11 ; GFX11-NEXT: s_add_u32 s13, s13, s16 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s17, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s17, s6, s11 ; GFX11-NEXT: s_add_u32 s12, s13, s12 -; GFX11-NEXT: s_mul_i32 s11, s7, s11 +; GFX11-NEXT: s_mul_i32 s11, s6, s11 ; GFX11-NEXT: s_addc_u32 s12, s14, s15 ; GFX11-NEXT: s_addc_u32 s13, s17, 0 ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 -; GFX11-NEXT: s_add_u32 s8, s8, s11 -; GFX11-NEXT: s_addc_u32 s7, s7, s12 -; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX11-NEXT: s_mul_i32 s12, s9, s8 -; GFX11-NEXT: s_mul_i32 s9, s9, s7 -; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_add_u32 s7, s7, s11 +; GFX11-NEXT: s_addc_u32 s6, s6, s12 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s7 +; GFX11-NEXT: s_mul_i32 s12, s9, s7 +; GFX11-NEXT: s_mul_i32 s9, s9, s6 +; GFX11-NEXT: s_mul_i32 s10, s10, s7 ; GFX11-NEXT: s_add_i32 s9, s11, s9 -; GFX11-NEXT: s_mul_i32 s11, s7, s12 +; GFX11-NEXT: s_mul_i32 s11, s6, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 -; GFX11-NEXT: s_mul_i32 s15, s8, s9 -; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX11-NEXT: s_mul_hi_u32 s10, s7, s12 +; GFX11-NEXT: s_mul_i32 s15, s7, s9 +; GFX11-NEXT: s_mul_hi_u32 s14, s7, s9 ; GFX11-NEXT: s_add_u32 s10, s10, s15 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 +; GFX11-NEXT: s_mul_hi_u32 s13, s6, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 +; GFX11-NEXT: s_mul_hi_u32 s12, s6, s9 ; GFX11-NEXT: s_add_u32 s10, s10, s11 -; GFX11-NEXT: s_mul_i32 s9, s7, s9 +; GFX11-NEXT: s_mul_i32 s9, s6, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 ; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 -; GFX11-NEXT: s_add_u32 s8, s8, s9 -; GFX11-NEXT: s_addc_u32 s7, s7, s10 -; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX11-NEXT: s_mul_i32 s12, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 -; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 -; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_add_u32 s7, s7, s9 +; GFX11-NEXT: s_addc_u32 s6, s6, s10 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX11-NEXT: s_mul_i32 s12, s2, s6 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s6 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s7 +; GFX11-NEXT: s_mul_i32 s7, s3, s7 ; GFX11-NEXT: s_add_u32 s9, s9, s12 ; GFX11-NEXT: s_addc_u32 s11, 0, s11 -; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 -; GFX11-NEXT: s_add_u32 s8, s9, s8 -; GFX11-NEXT: s_mul_i32 s7, s3, s7 -; GFX11-NEXT: s_addc_u32 s8, s11, s10 +; GFX11-NEXT: s_mul_hi_u32 s13, s3, s6 +; GFX11-NEXT: s_add_u32 s7, s9, s7 +; GFX11-NEXT: s_mul_i32 s6, s3, s6 +; GFX11-NEXT: s_addc_u32 s7, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 -; GFX11-NEXT: s_add_u32 s7, s8, s7 -; GFX11-NEXT: s_addc_u32 s8, 0, s9 -; GFX11-NEXT: s_mul_hi_u32 s9, s4, s7 -; GFX11-NEXT: s_mul_i32 s10, s4, s8 -; GFX11-NEXT: s_mul_i32 s11, s5, s7 -; GFX11-NEXT: s_add_i32 s9, s9, s10 +; GFX11-NEXT: s_add_u32 s6, s7, s6 +; GFX11-NEXT: s_addc_u32 s7, 0, s9 +; GFX11-NEXT: s_mul_hi_u32 s9, s4, s6 ; GFX11-NEXT: s_mul_i32 s10, s4, s7 +; GFX11-NEXT: s_mul_i32 s11, s5, s6 +; GFX11-NEXT: s_add_i32 s9, s9, s10 +; GFX11-NEXT: s_mul_i32 s10, s4, s6 ; GFX11-NEXT: s_add_i32 s9, s9, s11 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 @@ -2980,10 +2970,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s11, s13, s14 -; GFX11-NEXT: s_add_u32 s13, s7, 1 -; GFX11-NEXT: s_addc_u32 s14, s8, 0 -; GFX11-NEXT: s_add_u32 s15, s7, 2 -; GFX11-NEXT: s_addc_u32 s16, s8, 0 +; GFX11-NEXT: s_add_u32 s13, s6, 1 +; GFX11-NEXT: s_addc_u32 s14, s7, 0 +; GFX11-NEXT: s_add_u32 s15, s6, 2 +; GFX11-NEXT: s_addc_u32 s16, s7, 0 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: s_cselect_b32 s11, s15, s13 ; GFX11-NEXT: s_cselect_b32 s13, s16, s14 @@ -2998,14 +2988,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s3, s10, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-NEXT: s_cselect_b32 s9, s13, s8 -; GFX11-NEXT: s_cselect_b32 s8, s11, s7 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cselect_b32 s7, s13, s7 +; GFX11-NEXT: s_cselect_b32 s6, s11, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX11-NEXT: .LBB16_2: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX11-NEXT: s_sub_i32 s5, 0, s4 -; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -3028,15 +3018,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s2, s6, s2 ; GFX11-NEXT: s_add_i32 s5, s3, 1 ; GFX11-NEXT: s_cmp_ge_u32 s2, s4 -; GFX11-NEXT: s_cselect_b32 s8, s5, s3 +; GFX11-NEXT: s_cselect_b32 s6, s5, s3 ; GFX11-NEXT: .LBB16_3: ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s8 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: -; GFX11-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX11-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX11-NEXT: s_branch .LBB16_2 ; ; GFX1250-LABEL: sudiv64: diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 0166d7ac7ddc2..1f965c16ef4f2 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -403,8 +403,6 @@ define amdgpu_ps i32 @bfe_i64(i64 inreg %val0) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000 ; CHECK-NEXT: s_and_b32 s0, s0, 0xff -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 @@ -440,7 +438,6 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_and_b32 s0, s0, 0xff ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -522,7 +519,6 @@ define amdgpu_ps i32 @bcnt164(i64 inreg %val0) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b14e97d..3c3d634c96410 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1501,8 +1501,6 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_ashr_i32 s6, s3, 31 @@ -1832,8 +1830,6 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s3, v3 ; TONGA-NEXT: v_readfirstlane_b32 s2, v2 ; TONGA-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; TONGA-NEXT: s_mov_b32 s6, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB8_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s3, 31 @@ -2701,12 +2697,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s11, v5 ; GCN-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9] -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: v_readfirstlane_b32 s5, v7 -; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9] ; GCN-NEXT: v_readfirstlane_b32 s4, v6 ; GCN-NEXT: s_cbranch_scc0 .LBB10_6 ; GCN-NEXT: ; %bb.1: @@ -2855,8 +2849,6 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s6, s9, s6 ; GCN-NEXT: .LBB10_3: ; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB10_7 ; GCN-NEXT: ; %bb.4: ; GCN-NEXT: s_ashr_i32 s8, s3, 31 @@ -3344,8 +3336,6 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s3, v5 ; TONGA-NEXT: v_readfirstlane_b32 s2, v4 ; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; TONGA-NEXT: s_mov_b32 s6, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB10_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s1, 31 @@ -4878,8 +4868,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s19, v13 ; GCN-NEXT: v_readfirstlane_b32 s18, v12 -; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17] -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 @@ -4890,9 +4878,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s11, v9 ; GCN-NEXT: v_readfirstlane_b32 s10, v8 ; GCN-NEXT: v_readfirstlane_b32 s15, v15 -; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17] ; GCN-NEXT: v_readfirstlane_b32 s14, v14 -; GCN-NEXT: s_cbranch_scc0 .LBB12_6 +; GCN-NEXT: s_cbranch_scc0 .LBB12_10 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_ashr_i32 s6, s17, 31 ; GCN-NEXT: s_add_u32 s20, s16, s6 @@ -5039,9 +5027,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s6, s17, s6 ; GCN-NEXT: .LBB12_3: ; GCN-NEXT: s_or_b64 s[16:17], s[14:15], s[12:13] -; GCN-NEXT: s_mov_b32 s16, 0 -; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 -; GCN-NEXT: s_cbranch_scc0 .LBB12_7 +; GCN-NEXT: s_cbranch_scc0 .LBB12_11 ; GCN-NEXT: ; %bb.4: ; GCN-NEXT: s_ashr_i32 s16, s13, 31 ; GCN-NEXT: s_add_u32 s18, s12, s16 @@ -5165,7 +5151,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[20:21] ; GCN-NEXT: s_sub_u32 s18, s18, s20 ; GCN-NEXT: s_subb_u32 s19, s19, s20 -; GCN-NEXT: s_cbranch_execnz .LBB12_8 +; GCN-NEXT: s_cbranch_execnz .LBB12_12 ; GCN-NEXT: .LBB12_5: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: s_sub_i32 s13, 0, s12 @@ -5185,22 +5171,35 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-NEXT: s_branch .LBB12_9 +; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9] +; GCN-NEXT: s_cbranch_scc1 .LBB12_13 ; GCN-NEXT: .LBB12_6: +; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GCN-NEXT: s_branch .LBB12_14 +; GCN-NEXT: .LBB12_7: +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] +; GCN-NEXT: s_cbranch_scc1 .LBB12_15 +; GCN-NEXT: .LBB12_8: +; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GCN-NEXT: s_branch .LBB12_16 +; GCN-NEXT: .LBB12_9: +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_branch .LBB12_17 +; GCN-NEXT: .LBB12_10: ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_branch .LBB12_2 -; GCN-NEXT: .LBB12_7: +; GCN-NEXT: .LBB12_11: ; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_branch .LBB12_5 -; GCN-NEXT: .LBB12_8: +; GCN-NEXT: .LBB12_12: ; GCN-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: .LBB12_9: ; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9] -; GCN-NEXT: s_mov_b32 s12, 0 -; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 -; GCN-NEXT: s_cbranch_scc0 .LBB12_12 -; GCN-NEXT: ; %bb.10: +; GCN-NEXT: s_cbranch_scc0 .LBB12_6 +; GCN-NEXT: .LBB12_13: ; GCN-NEXT: s_ashr_i32 s12, s9, 31 ; GCN-NEXT: s_add_u32 s14, s8, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5323,8 +5322,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[16:17] ; GCN-NEXT: s_sub_u32 s14, s14, s16 ; GCN-NEXT: s_subb_u32 s15, s15, s16 -; GCN-NEXT: s_cbranch_execnz .LBB12_13 -; GCN-NEXT: .LBB12_11: +; GCN-NEXT: s_cbranch_execnz .LBB12_7 +; GCN-NEXT: .LBB12_14: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s9, 0, s8 ; GCN-NEXT: v_mov_b32_e32 v5, 0 @@ -5343,19 +5342,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s8, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GCN-NEXT: s_branch .LBB12_14 -; GCN-NEXT: .LBB12_12: -; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15 -; GCN-NEXT: s_branch .LBB12_11 -; GCN-NEXT: .LBB12_13: -; GCN-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NEXT: .LBB12_14: ; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GCN-NEXT: s_cbranch_scc0 .LBB12_17 -; GCN-NEXT: ; %bb.15: +; GCN-NEXT: s_cbranch_scc0 .LBB12_8 +; GCN-NEXT: .LBB12_15: ; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_add_u32 s10, s2, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -5478,7 +5467,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] ; GCN-NEXT: s_sub_u32 s10, s10, s12 ; GCN-NEXT: s_subb_u32 s11, s11, s12 -; GCN-NEXT: s_cbranch_execnz .LBB12_18 +; GCN-NEXT: s_cbranch_execnz .LBB12_9 ; GCN-NEXT: .LBB12_16: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_sub_i32 s3, 0, s2 @@ -5498,14 +5487,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GCN-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GCN-NEXT: s_branch .LBB12_19 ; GCN-NEXT: .LBB12_17: -; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 -; GCN-NEXT: s_branch .LBB12_16 -; GCN-NEXT: .LBB12_18: -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: .LBB12_19: ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 @@ -6119,23 +6101,23 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_add_u32 s0, s6, 48 ; TONGA-NEXT: v_mov_b32_e32 v0, s6 -; TONGA-NEXT: s_addc_u32 s1, s7, 0 +; TONGA-NEXT: s_add_u32 s0, s6, 48 ; TONGA-NEXT: v_mov_b32_e32 v1, s7 -; TONGA-NEXT: s_add_u32 s2, s6, 32 +; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] -; TONGA-NEXT: s_addc_u32 s3, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v0, s2 -; TONGA-NEXT: v_mov_b32_e32 v1, s3 -; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 +; TONGA-NEXT: s_add_u32 s0, s6, 32 +; TONGA-NEXT: s_addc_u32 s1, s7, 0 +; TONGA-NEXT: v_mov_b32_e32 v3, s1 +; TONGA-NEXT: v_mov_b32_e32 v2, s0 ; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 -; TONGA-NEXT: v_mov_b32_e32 v4, s0 +; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_readfirstlane_b32 s3, v15 @@ -6144,8 +6126,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s1, v11 ; TONGA-NEXT: v_readfirstlane_b32 s0, v10 ; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] -; TONGA-NEXT: s_mov_b32 s6, 0 -; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB12_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s1, 31 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 28c6b40554bb6..51aa8706abac2 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -731,12 +731,11 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX1032-NEXT: s_mov_b32 s8, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 @@ -751,71 +750,71 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s5 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 -; GFX1032-NEXT: s_mul_i32 s12, s10, s8 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1032-NEXT: s_mul_i32 s11, s9, s4 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s5 +; GFX1032-NEXT: s_mul_i32 s12, s10, s5 ; GFX1032-NEXT: s_add_i32 s11, s13, s11 -; GFX1032-NEXT: s_mul_i32 s14, s9, s8 +; GFX1032-NEXT: s_mul_i32 s14, s9, s5 ; GFX1032-NEXT: s_add_i32 s11, s11, s12 -; GFX1032-NEXT: s_mul_hi_u32 s13, s8, s14 -; GFX1032-NEXT: s_mul_i32 s16, s8, s11 -; GFX1032-NEXT: s_mul_hi_u32 s15, s5, s14 -; GFX1032-NEXT: s_mul_i32 s12, s5, s14 -; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s14 +; GFX1032-NEXT: s_mul_i32 s16, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s15, s4, s14 +; GFX1032-NEXT: s_mul_i32 s12, s4, s14 +; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s11 ; GFX1032-NEXT: s_add_u32 s13, s13, s16 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s17, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s17, s4, s11 ; GFX1032-NEXT: s_add_u32 s12, s13, s12 -; GFX1032-NEXT: s_mul_i32 s11, s5, s11 +; GFX1032-NEXT: s_mul_i32 s11, s4, s11 ; GFX1032-NEXT: s_addc_u32 s12, s14, s15 ; GFX1032-NEXT: s_addc_u32 s13, s17, 0 ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 -; GFX1032-NEXT: s_add_u32 s8, s8, s11 -; GFX1032-NEXT: s_addc_u32 s5, s5, s12 -; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 -; GFX1032-NEXT: s_mul_i32 s12, s9, s8 -; GFX1032-NEXT: s_mul_i32 s9, s9, s5 -; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_add_u32 s5, s5, s11 +; GFX1032-NEXT: s_addc_u32 s4, s4, s12 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s5 +; GFX1032-NEXT: s_mul_i32 s12, s9, s5 +; GFX1032-NEXT: s_mul_i32 s9, s9, s4 +; GFX1032-NEXT: s_mul_i32 s10, s10, s5 ; GFX1032-NEXT: s_add_i32 s9, s11, s9 -; GFX1032-NEXT: s_mul_i32 s11, s5, s12 +; GFX1032-NEXT: s_mul_i32 s11, s4, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 -; GFX1032-NEXT: s_mul_i32 s15, s8, s9 -; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1032-NEXT: s_mul_hi_u32 s10, s5, s12 +; GFX1032-NEXT: s_mul_i32 s15, s5, s9 +; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s9 ; GFX1032-NEXT: s_add_u32 s10, s10, s15 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 +; GFX1032-NEXT: s_mul_hi_u32 s13, s4, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 +; GFX1032-NEXT: s_mul_hi_u32 s12, s4, s9 ; GFX1032-NEXT: s_add_u32 s10, s10, s11 -; GFX1032-NEXT: s_mul_i32 s9, s5, s9 +; GFX1032-NEXT: s_mul_i32 s9, s4, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 ; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 -; GFX1032-NEXT: s_add_u32 s8, s8, s9 -; GFX1032-NEXT: s_addc_u32 s5, s5, s10 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 -; GFX1032-NEXT: s_mul_i32 s12, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 -; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 -; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_add_u32 s5, s5, s9 +; GFX1032-NEXT: s_addc_u32 s4, s4, s10 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s5 +; GFX1032-NEXT: s_mul_i32 s12, s2, s4 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s4 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s5 +; GFX1032-NEXT: s_mul_i32 s5, s3, s5 ; GFX1032-NEXT: s_add_u32 s9, s9, s12 ; GFX1032-NEXT: s_addc_u32 s11, 0, s11 -; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 -; GFX1032-NEXT: s_add_u32 s8, s9, s8 -; GFX1032-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032-NEXT: s_addc_u32 s8, s11, s10 +; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s4 +; GFX1032-NEXT: s_add_u32 s5, s9, s5 +; GFX1032-NEXT: s_mul_i32 s4, s3, s4 +; GFX1032-NEXT: s_addc_u32 s5, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 -; GFX1032-NEXT: s_add_u32 s5, s8, s5 -; GFX1032-NEXT: s_addc_u32 s8, 0, s9 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX1032-NEXT: s_mul_i32 s10, s0, s8 -; GFX1032-NEXT: s_mul_i32 s11, s1, s5 -; GFX1032-NEXT: s_add_i32 s9, s9, s10 +; GFX1032-NEXT: s_add_u32 s4, s5, s4 +; GFX1032-NEXT: s_addc_u32 s5, 0, s9 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s4 ; GFX1032-NEXT: s_mul_i32 s10, s0, s5 +; GFX1032-NEXT: s_mul_i32 s11, s1, s4 +; GFX1032-NEXT: s_add_i32 s9, s9, s10 +; GFX1032-NEXT: s_mul_i32 s10, s0, s4 ; GFX1032-NEXT: s_add_i32 s9, s9, s11 ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 @@ -829,10 +828,10 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1032-NEXT: s_cmp_eq_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1032-NEXT: s_add_u32 s13, s5, 1 -; GFX1032-NEXT: s_addc_u32 s14, s8, 0 -; GFX1032-NEXT: s_add_u32 s15, s5, 2 -; GFX1032-NEXT: s_addc_u32 s16, s8, 0 +; GFX1032-NEXT: s_add_u32 s13, s4, 1 +; GFX1032-NEXT: s_addc_u32 s14, s5, 0 +; GFX1032-NEXT: s_add_u32 s15, s4, 2 +; GFX1032-NEXT: s_addc_u32 s16, s5, 0 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1032-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1032-NEXT: s_cselect_b32 s13, s16, s14 @@ -845,14 +844,14 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1032-NEXT: s_cselect_b32 s1, s10, s9 ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cselect_b32 s9, s13, s8 -; GFX1032-NEXT: s_cselect_b32 s8, s11, s5 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX1032-NEXT: s_cselect_b32 s5, s13, s5 +; GFX1032-NEXT: s_cselect_b32 s4, s11, s4 +; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1032-NEXT: s_sub_i32 s3, 0, s0 -; GFX1032-NEXT: s_mov_b32 s9, 0 +; GFX1032-NEXT: s_mov_b32 s5, 0 ; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -870,15 +869,15 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 s2, s4, s2 ; GFX1032-NEXT: s_add_i32 s3, s1, 1 ; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 -; GFX1032-NEXT: s_cselect_b32 s8, s3, s1 +; GFX1032-NEXT: s_cselect_b32 s4, s3, s1 ; GFX1032-NEXT: .LBB15_3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s8 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s9 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB15_4: -; GFX1032-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX1032-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1032-NEXT: s_branch .LBB15_2 ; ; GFX1064-LABEL: test_udiv64: @@ -888,8 +887,6 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] -; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 From 221099c4df28743439f69b82f5676d0b8da2bd4d Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 18:17:48 -0500 Subject: [PATCH 6/7] Fix bug and update tests --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +- .../test/CodeGen/AMDGPU/carryout-selection.ll | 346 +++++++++--------- llvm/test/CodeGen/AMDGPU/srem.ll | 98 +++-- llvm/test/CodeGen/AMDGPU/wave32.ll | 115 +++--- 4 files changed, 302 insertions(+), 265 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 28967ebfb8a57..1c4177404f64c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1315,7 +1315,7 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, MachineInstr * SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const { - if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) + if (MI.getOpcode() != AMDGPU::REG_SEQUENCE || MI.getNumOperands() != 5) return nullptr; const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -1331,7 +1331,11 @@ SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const { } for (unsigned I : {0, 1}) - if (SubRegIsConst[I] && !SubRegValues[I]) + if (SubRegIsConst[I] && !SubRegValues[I] && + MRI.getRegClass(RealDefs[(I + 1) % 2]->getOperand(0).getReg()) + ->MC->getSizeInBits() * + 2 == + MRI.getRegClass(MI.getOperand(0).getReg())->MC->getSizeInBits()) return RealDefs[(I + 1) % 2]; return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index e7de08537883f..8d05317162e9c 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -2116,6 +2116,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_cmp_lg_u64 s[6:7], 0 ; VI-NEXT: s_cbranch_scc0 .LBB16_3 ; VI-NEXT: ; %bb.1: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2266,6 +2268,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2414,9 +2418,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_clause 0x1 ; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1010-NEXT: s_mov_b32 s8, 0 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) ; GFX1010-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] +; GFX1010-NEXT: s_mov_b32 s4, 0 +; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 @@ -2431,71 +2436,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1010-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1010-NEXT: v_readfirstlane_b32 s5, v0 -; GFX1010-NEXT: s_mul_i32 s11, s9, s4 -; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s5 -; GFX1010-NEXT: s_mul_i32 s12, s10, s5 +; GFX1010-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1010-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s10, s8 ; GFX1010-NEXT: s_add_i32 s11, s13, s11 -; GFX1010-NEXT: s_mul_i32 s14, s9, s5 +; GFX1010-NEXT: s_mul_i32 s14, s9, s8 ; GFX1010-NEXT: s_add_i32 s11, s11, s12 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s14 -; GFX1010-NEXT: s_mul_i32 s16, s5, s11 -; GFX1010-NEXT: s_mul_hi_u32 s15, s4, s14 -; GFX1010-NEXT: s_mul_i32 s12, s4, s14 -; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s11 +; GFX1010-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1010-NEXT: s_mul_i32 s16, s8, s11 +; GFX1010-NEXT: s_mul_hi_u32 s15, s5, s14 +; GFX1010-NEXT: s_mul_i32 s12, s5, s14 +; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s11 ; GFX1010-NEXT: s_add_u32 s13, s13, s16 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s17, s4, s11 +; GFX1010-NEXT: s_mul_hi_u32 s17, s5, s11 ; GFX1010-NEXT: s_add_u32 s12, s13, s12 -; GFX1010-NEXT: s_mul_i32 s11, s4, s11 +; GFX1010-NEXT: s_mul_i32 s11, s5, s11 ; GFX1010-NEXT: s_addc_u32 s12, s14, s15 ; GFX1010-NEXT: s_addc_u32 s13, s17, 0 ; GFX1010-NEXT: s_add_u32 s11, s12, s11 ; GFX1010-NEXT: s_addc_u32 s12, 0, s13 -; GFX1010-NEXT: s_add_u32 s5, s5, s11 -; GFX1010-NEXT: s_addc_u32 s4, s4, s12 -; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s5 -; GFX1010-NEXT: s_mul_i32 s12, s9, s5 -; GFX1010-NEXT: s_mul_i32 s9, s9, s4 -; GFX1010-NEXT: s_mul_i32 s10, s10, s5 +; GFX1010-NEXT: s_add_u32 s8, s8, s11 +; GFX1010-NEXT: s_addc_u32 s5, s5, s12 +; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s9, s8 +; GFX1010-NEXT: s_mul_i32 s9, s9, s5 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 ; GFX1010-NEXT: s_add_i32 s9, s11, s9 -; GFX1010-NEXT: s_mul_i32 s11, s4, s12 +; GFX1010-NEXT: s_mul_i32 s11, s5, s12 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_hi_u32 s10, s5, s12 -; GFX1010-NEXT: s_mul_i32 s15, s5, s9 -; GFX1010-NEXT: s_mul_hi_u32 s14, s5, s9 +; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1010-NEXT: s_mul_i32 s15, s8, s9 +; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 ; GFX1010-NEXT: s_add_u32 s10, s10, s15 -; GFX1010-NEXT: s_mul_hi_u32 s13, s4, s12 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1010-NEXT: s_addc_u32 s14, 0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9 ; GFX1010-NEXT: s_add_u32 s10, s10, s11 -; GFX1010-NEXT: s_mul_i32 s9, s4, s9 +; GFX1010-NEXT: s_mul_i32 s9, s5, s9 ; GFX1010-NEXT: s_addc_u32 s10, s14, s13 ; GFX1010-NEXT: s_addc_u32 s11, s12, 0 ; GFX1010-NEXT: s_add_u32 s9, s10, s9 ; GFX1010-NEXT: s_addc_u32 s10, 0, s11 -; GFX1010-NEXT: s_add_u32 s5, s5, s9 -; GFX1010-NEXT: s_addc_u32 s4, s4, s10 -; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1010-NEXT: s_mul_i32 s12, s2, s4 -; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s4 -; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s5 -; GFX1010-NEXT: s_mul_i32 s5, s3, s5 +; GFX1010-NEXT: s_add_u32 s8, s8, s9 +; GFX1010-NEXT: s_addc_u32 s5, s5, s10 +; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX1010-NEXT: s_mul_i32 s12, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 ; GFX1010-NEXT: s_add_u32 s9, s9, s12 ; GFX1010-NEXT: s_addc_u32 s11, 0, s11 -; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s4 -; GFX1010-NEXT: s_add_u32 s5, s9, s5 -; GFX1010-NEXT: s_mul_i32 s4, s3, s4 -; GFX1010-NEXT: s_addc_u32 s5, s11, s10 +; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 +; GFX1010-NEXT: s_add_u32 s8, s9, s8 +; GFX1010-NEXT: s_mul_i32 s5, s3, s5 +; GFX1010-NEXT: s_addc_u32 s8, s11, s10 ; GFX1010-NEXT: s_addc_u32 s9, s13, 0 -; GFX1010-NEXT: s_add_u32 s4, s5, s4 -; GFX1010-NEXT: s_addc_u32 s5, 0, s9 -; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s4 -; GFX1010-NEXT: s_mul_i32 s10, s6, s5 -; GFX1010-NEXT: s_mul_i32 s11, s7, s4 +; GFX1010-NEXT: s_add_u32 s5, s8, s5 +; GFX1010-NEXT: s_addc_u32 s8, 0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s5 +; GFX1010-NEXT: s_mul_i32 s10, s6, s8 +; GFX1010-NEXT: s_mul_i32 s11, s7, s5 ; GFX1010-NEXT: s_add_i32 s9, s9, s10 -; GFX1010-NEXT: s_mul_i32 s10, s6, s4 +; GFX1010-NEXT: s_mul_i32 s10, s6, s5 ; GFX1010-NEXT: s_add_i32 s9, s9, s11 ; GFX1010-NEXT: s_sub_i32 s11, s3, s9 ; GFX1010-NEXT: s_sub_u32 s10, s2, s10 @@ -2509,10 +2514,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1010-NEXT: s_cmp_eq_u32 s11, s7 ; GFX1010-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1010-NEXT: s_add_u32 s13, s4, 1 -; GFX1010-NEXT: s_addc_u32 s14, s5, 0 -; GFX1010-NEXT: s_add_u32 s15, s4, 2 -; GFX1010-NEXT: s_addc_u32 s16, s5, 0 +; GFX1010-NEXT: s_add_u32 s13, s5, 1 +; GFX1010-NEXT: s_addc_u32 s14, s8, 0 +; GFX1010-NEXT: s_add_u32 s15, s5, 2 +; GFX1010-NEXT: s_addc_u32 s16, s8, 0 ; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1010-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1010-NEXT: s_cselect_b32 s13, s16, s14 @@ -2525,13 +2530,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_eq_u32 s3, s7 ; GFX1010-NEXT: s_cselect_b32 s3, s10, s9 ; GFX1010-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1010-NEXT: s_cselect_b32 s5, s13, s5 -; GFX1010-NEXT: s_cselect_b32 s4, s11, s4 -; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1010-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1010-NEXT: s_cselect_b32 s8, s11, s5 +; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: ; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX1010-NEXT: s_sub_i32 s4, 0, s6 +; GFX1010-NEXT: s_mov_b32 s9, 0 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2549,16 +2555,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cselect_b32 s2, s5, s2 ; GFX1010-NEXT: s_add_i32 s4, s3, 1 ; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 -; GFX1010-NEXT: s_mov_b32 s5, 0 -; GFX1010-NEXT: s_cselect_b32 s4, s4, s3 +; GFX1010-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1010-NEXT: .LBB16_3: -; GFX1010-NEXT: v_mov_b32_e32 v0, s4 +; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s5 +; GFX1010-NEXT: v_mov_b32_e32 v1, s9 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; GFX1010-NEXT: .LBB16_4: -; GFX1010-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX1010-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1010-NEXT: s_branch .LBB16_2 ; ; GFX1030W32-LABEL: sudiv64: @@ -2566,9 +2571,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_clause 0x1 ; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX1030W32-NEXT: s_mov_b32 s8, 0 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W32-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1030W32-NEXT: s_mov_b32 s6, 0 +; GFX1030W32-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W32-NEXT: ; %bb.1: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2583,71 +2589,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W32-NEXT: v_readfirstlane_b32 s6, v1 -; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v0 -; GFX1030W32-NEXT: s_mul_i32 s11, s9, s6 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s7 -; GFX1030W32-NEXT: s_mul_i32 s12, s10, s7 +; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1030W32-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s10, s8 ; GFX1030W32-NEXT: s_add_i32 s11, s13, s11 -; GFX1030W32-NEXT: s_mul_i32 s14, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s14, s9, s8 ; GFX1030W32-NEXT: s_add_i32 s11, s11, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s14 -; GFX1030W32-NEXT: s_mul_i32 s16, s7, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s15, s6, s14 -; GFX1030W32-NEXT: s_mul_i32 s12, s6, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1030W32-NEXT: s_mul_i32 s16, s8, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s15, s7, s14 +; GFX1030W32-NEXT: s_mul_i32 s12, s7, s14 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s11 ; GFX1030W32-NEXT: s_add_u32 s13, s13, s16 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s17, s6, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s17, s7, s11 ; GFX1030W32-NEXT: s_add_u32 s12, s13, s12 -; GFX1030W32-NEXT: s_mul_i32 s11, s6, s11 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, s14, s15 ; GFX1030W32-NEXT: s_addc_u32 s13, s17, 0 ; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 ; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 -; GFX1030W32-NEXT: s_add_u32 s7, s7, s11 -; GFX1030W32-NEXT: s_addc_u32 s6, s6, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s7 -; GFX1030W32-NEXT: s_mul_i32 s12, s9, s7 -; GFX1030W32-NEXT: s_mul_i32 s9, s9, s6 -; GFX1030W32-NEXT: s_mul_i32 s10, s10, s7 +; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 ; GFX1030W32-NEXT: s_add_i32 s9, s11, s9 -; GFX1030W32-NEXT: s_mul_i32 s11, s6, s12 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s7, s12 -; GFX1030W32-NEXT: s_mul_i32 s15, s7, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 ; GFX1030W32-NEXT: s_add_u32 s10, s10, s15 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s6, s12 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s6, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9 ; GFX1030W32-NEXT: s_add_u32 s10, s10, s11 -; GFX1030W32-NEXT: s_mul_i32 s9, s6, s9 +; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 ; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0 ; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 ; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 -; GFX1030W32-NEXT: s_add_u32 s7, s7, s9 -; GFX1030W32-NEXT: s_addc_u32 s6, s6, s10 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s6 -; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s6 -; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s7 -; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 +; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 ; GFX1030W32-NEXT: s_add_u32 s9, s9, s12 ; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s6 -; GFX1030W32-NEXT: s_add_u32 s7, s9, s7 -; GFX1030W32-NEXT: s_mul_i32 s6, s3, s6 -; GFX1030W32-NEXT: s_addc_u32 s7, s11, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 +; GFX1030W32-NEXT: s_add_u32 s8, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 +; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10 ; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 -; GFX1030W32-NEXT: s_add_u32 s6, s7, s6 -; GFX1030W32-NEXT: s_addc_u32 s7, 0, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s6 -; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7 -; GFX1030W32-NEXT: s_mul_i32 s11, s5, s6 +; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX1030W32-NEXT: s_mul_i32 s10, s4, s8 +; GFX1030W32-NEXT: s_mul_i32 s11, s5, s7 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 -; GFX1030W32-NEXT: s_mul_i32 s10, s4, s6 +; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7 ; GFX1030W32-NEXT: s_add_i32 s9, s9, s11 ; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 ; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 @@ -2661,10 +2667,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s5 ; GFX1030W32-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1030W32-NEXT: s_add_u32 s13, s6, 1 -; GFX1030W32-NEXT: s_addc_u32 s14, s7, 0 -; GFX1030W32-NEXT: s_add_u32 s15, s6, 2 -; GFX1030W32-NEXT: s_addc_u32 s16, s7, 0 +; GFX1030W32-NEXT: s_add_u32 s13, s7, 1 +; GFX1030W32-NEXT: s_addc_u32 s14, s8, 0 +; GFX1030W32-NEXT: s_add_u32 s15, s7, 2 +; GFX1030W32-NEXT: s_addc_u32 s16, s8, 0 ; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1030W32-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1030W32-NEXT: s_cselect_b32 s13, s16, s14 @@ -2677,14 +2683,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cmp_eq_u32 s3, s5 ; GFX1030W32-NEXT: s_cselect_b32 s3, s10, s9 ; GFX1030W32-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1030W32-NEXT: s_cselect_b32 s7, s13, s7 -; GFX1030W32-NEXT: s_cselect_b32 s6, s11, s6 -; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1030W32-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1030W32-NEXT: s_cselect_b32 s8, s11, s7 +; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1030W32-NEXT: .LBB16_2: ; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX1030W32-NEXT: s_sub_i32 s5, 0, s4 -; GFX1030W32-NEXT: s_mov_b32 s7, 0 +; GFX1030W32-NEXT: s_mov_b32 s9, 0 ; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2702,15 +2708,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cselect_b32 s2, s6, s2 ; GFX1030W32-NEXT: s_add_i32 s5, s3, 1 ; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1030W32-NEXT: s_cselect_b32 s6, s5, s3 +; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3 ; GFX1030W32-NEXT: .LBB16_3: -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm ; GFX1030W32-NEXT: .LBB16_4: -; GFX1030W32-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX1030W32-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1030W32-NEXT: s_branch .LBB16_2 ; ; GFX1030W64-LABEL: sudiv64: @@ -2720,6 +2726,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030W64-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1030W64-NEXT: s_mov_b32 s6, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2868,9 +2876,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX11-NEXT: ; %bb.1: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 @@ -2890,71 +2900,71 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-NEXT: v_readfirstlane_b32 s7, v0 -; GFX11-NEXT: s_mul_i32 s11, s9, s6 -; GFX11-NEXT: s_mul_hi_u32 s13, s9, s7 -; GFX11-NEXT: s_mul_i32 s12, s10, s7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-NEXT: v_readfirstlane_b32 s8, v0 +; GFX11-NEXT: s_mul_i32 s11, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s10, s8 ; GFX11-NEXT: s_add_i32 s11, s13, s11 -; GFX11-NEXT: s_mul_i32 s14, s9, s7 +; GFX11-NEXT: s_mul_i32 s14, s9, s8 ; GFX11-NEXT: s_add_i32 s11, s11, s12 -; GFX11-NEXT: s_mul_hi_u32 s13, s7, s14 -; GFX11-NEXT: s_mul_i32 s16, s7, s11 -; GFX11-NEXT: s_mul_hi_u32 s15, s6, s14 -; GFX11-NEXT: s_mul_i32 s12, s6, s14 -; GFX11-NEXT: s_mul_hi_u32 s14, s7, s11 +; GFX11-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX11-NEXT: s_mul_i32 s16, s8, s11 +; GFX11-NEXT: s_mul_hi_u32 s15, s7, s14 +; GFX11-NEXT: s_mul_i32 s12, s7, s14 +; GFX11-NEXT: s_mul_hi_u32 s14, s8, s11 ; GFX11-NEXT: s_add_u32 s13, s13, s16 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s17, s6, s11 +; GFX11-NEXT: s_mul_hi_u32 s17, s7, s11 ; GFX11-NEXT: s_add_u32 s12, s13, s12 -; GFX11-NEXT: s_mul_i32 s11, s6, s11 +; GFX11-NEXT: s_mul_i32 s11, s7, s11 ; GFX11-NEXT: s_addc_u32 s12, s14, s15 ; GFX11-NEXT: s_addc_u32 s13, s17, 0 ; GFX11-NEXT: s_add_u32 s11, s12, s11 ; GFX11-NEXT: s_addc_u32 s12, 0, s13 -; GFX11-NEXT: s_add_u32 s7, s7, s11 -; GFX11-NEXT: s_addc_u32 s6, s6, s12 -; GFX11-NEXT: s_mul_hi_u32 s11, s9, s7 -; GFX11-NEXT: s_mul_i32 s12, s9, s7 -; GFX11-NEXT: s_mul_i32 s9, s9, s6 -; GFX11-NEXT: s_mul_i32 s10, s10, s7 +; GFX11-NEXT: s_add_u32 s8, s8, s11 +; GFX11-NEXT: s_addc_u32 s7, s7, s12 +; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s9, s8 +; GFX11-NEXT: s_mul_i32 s9, s9, s7 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 ; GFX11-NEXT: s_add_i32 s9, s11, s9 -; GFX11-NEXT: s_mul_i32 s11, s6, s12 +; GFX11-NEXT: s_mul_i32 s11, s7, s12 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_hi_u32 s10, s7, s12 -; GFX11-NEXT: s_mul_i32 s15, s7, s9 -; GFX11-NEXT: s_mul_hi_u32 s14, s7, s9 +; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX11-NEXT: s_mul_i32 s15, s8, s9 +; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 ; GFX11-NEXT: s_add_u32 s10, s10, s15 -; GFX11-NEXT: s_mul_hi_u32 s13, s6, s12 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12 ; GFX11-NEXT: s_addc_u32 s14, 0, s14 -; GFX11-NEXT: s_mul_hi_u32 s12, s6, s9 +; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9 ; GFX11-NEXT: s_add_u32 s10, s10, s11 -; GFX11-NEXT: s_mul_i32 s9, s6, s9 +; GFX11-NEXT: s_mul_i32 s9, s7, s9 ; GFX11-NEXT: s_addc_u32 s10, s14, s13 ; GFX11-NEXT: s_addc_u32 s11, s12, 0 ; GFX11-NEXT: s_add_u32 s9, s10, s9 ; GFX11-NEXT: s_addc_u32 s10, 0, s11 -; GFX11-NEXT: s_add_u32 s7, s7, s9 -; GFX11-NEXT: s_addc_u32 s6, s6, s10 -; GFX11-NEXT: s_mul_hi_u32 s9, s2, s7 -; GFX11-NEXT: s_mul_i32 s12, s2, s6 -; GFX11-NEXT: s_mul_hi_u32 s11, s2, s6 -; GFX11-NEXT: s_mul_hi_u32 s10, s3, s7 -; GFX11-NEXT: s_mul_i32 s7, s3, s7 +; GFX11-NEXT: s_add_u32 s8, s8, s9 +; GFX11-NEXT: s_addc_u32 s7, s7, s10 +; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX11-NEXT: s_mul_i32 s12, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 ; GFX11-NEXT: s_add_u32 s9, s9, s12 ; GFX11-NEXT: s_addc_u32 s11, 0, s11 -; GFX11-NEXT: s_mul_hi_u32 s13, s3, s6 -; GFX11-NEXT: s_add_u32 s7, s9, s7 -; GFX11-NEXT: s_mul_i32 s6, s3, s6 -; GFX11-NEXT: s_addc_u32 s7, s11, s10 +; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 +; GFX11-NEXT: s_add_u32 s8, s9, s8 +; GFX11-NEXT: s_mul_i32 s7, s3, s7 +; GFX11-NEXT: s_addc_u32 s8, s11, s10 ; GFX11-NEXT: s_addc_u32 s9, s13, 0 -; GFX11-NEXT: s_add_u32 s6, s7, s6 -; GFX11-NEXT: s_addc_u32 s7, 0, s9 -; GFX11-NEXT: s_mul_hi_u32 s9, s4, s6 -; GFX11-NEXT: s_mul_i32 s10, s4, s7 -; GFX11-NEXT: s_mul_i32 s11, s5, s6 +; GFX11-NEXT: s_add_u32 s7, s8, s7 +; GFX11-NEXT: s_addc_u32 s8, 0, s9 +; GFX11-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX11-NEXT: s_mul_i32 s10, s4, s8 +; GFX11-NEXT: s_mul_i32 s11, s5, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s10 -; GFX11-NEXT: s_mul_i32 s10, s4, s6 +; GFX11-NEXT: s_mul_i32 s10, s4, s7 ; GFX11-NEXT: s_add_i32 s9, s9, s11 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_sub_i32 s11, s3, s9 @@ -2970,10 +2980,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s13, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s11, s5 ; GFX11-NEXT: s_cselect_b32 s11, s13, s14 -; GFX11-NEXT: s_add_u32 s13, s6, 1 -; GFX11-NEXT: s_addc_u32 s14, s7, 0 -; GFX11-NEXT: s_add_u32 s15, s6, 2 -; GFX11-NEXT: s_addc_u32 s16, s7, 0 +; GFX11-NEXT: s_add_u32 s13, s7, 1 +; GFX11-NEXT: s_addc_u32 s14, s8, 0 +; GFX11-NEXT: s_add_u32 s15, s7, 2 +; GFX11-NEXT: s_addc_u32 s16, s8, 0 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: s_cselect_b32 s11, s15, s13 ; GFX11-NEXT: s_cselect_b32 s13, s16, s14 @@ -2988,14 +2998,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s3, s10, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-NEXT: s_cselect_b32 s7, s13, s7 -; GFX11-NEXT: s_cselect_b32 s6, s11, s6 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cselect_b32 s9, s13, s8 +; GFX11-NEXT: s_cselect_b32 s8, s11, s7 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX11-NEXT: .LBB16_2: ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX11-NEXT: s_sub_i32 s5, 0, s4 -; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_mov_b32 s9, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -3018,15 +3028,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cselect_b32 s2, s6, s2 ; GFX11-NEXT: s_add_i32 s5, s3, 1 ; GFX11-NEXT: s_cmp_ge_u32 s2, s4 -; GFX11-NEXT: s_cselect_b32 s6, s5, s3 +; GFX11-NEXT: s_cselect_b32 s8, s5, s3 ; GFX11-NEXT: .LBB16_3: ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: -; GFX11-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX11-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX11-NEXT: s_branch .LBB16_2 ; ; GFX1250-LABEL: sudiv64: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 3c3d634c96410..e12e31b14e97d 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1501,6 +1501,8 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_ashr_i32 s6, s3, 31 @@ -1830,6 +1832,8 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_readfirstlane_b32 s3, v3 ; TONGA-NEXT: v_readfirstlane_b32 s2, v2 ; TONGA-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB8_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s3, 31 @@ -2697,10 +2701,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s11, v5 ; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9] +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: v_readfirstlane_b32 s5, v7 -; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9] +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GCN-NEXT: v_readfirstlane_b32 s4, v6 ; GCN-NEXT: s_cbranch_scc0 .LBB10_6 ; GCN-NEXT: ; %bb.1: @@ -2849,6 +2855,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s6, s9, s6 ; GCN-NEXT: .LBB10_3: ; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB10_7 ; GCN-NEXT: ; %bb.4: ; GCN-NEXT: s_ashr_i32 s8, s3, 31 @@ -3336,6 +3344,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s3, v5 ; TONGA-NEXT: v_readfirstlane_b32 s2, v4 ; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB10_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s1, 31 @@ -4868,6 +4878,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s19, v13 ; GCN-NEXT: v_readfirstlane_b32 s18, v12 +; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17] +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 @@ -4878,9 +4890,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s11, v9 ; GCN-NEXT: v_readfirstlane_b32 s10, v8 ; GCN-NEXT: v_readfirstlane_b32 s15, v15 -; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17] +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GCN-NEXT: v_readfirstlane_b32 s14, v14 -; GCN-NEXT: s_cbranch_scc0 .LBB12_10 +; GCN-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_ashr_i32 s6, s17, 31 ; GCN-NEXT: s_add_u32 s20, s16, s6 @@ -5027,7 +5039,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_cselect_b32 s6, s17, s6 ; GCN-NEXT: .LBB12_3: ; GCN-NEXT: s_or_b64 s[16:17], s[14:15], s[12:13] -; GCN-NEXT: s_cbranch_scc0 .LBB12_11 +; GCN-NEXT: s_mov_b32 s16, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_7 ; GCN-NEXT: ; %bb.4: ; GCN-NEXT: s_ashr_i32 s16, s13, 31 ; GCN-NEXT: s_add_u32 s18, s12, s16 @@ -5151,7 +5165,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[20:21] ; GCN-NEXT: s_sub_u32 s18, s18, s20 ; GCN-NEXT: s_subb_u32 s19, s19, s20 -; GCN-NEXT: s_cbranch_execnz .LBB12_12 +; GCN-NEXT: s_cbranch_execnz .LBB12_8 ; GCN-NEXT: .LBB12_5: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: s_sub_i32 s13, 0, s12 @@ -5171,35 +5185,22 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9] -; GCN-NEXT: s_cbranch_scc1 .LBB12_13 +; GCN-NEXT: s_branch .LBB12_9 ; GCN-NEXT: .LBB12_6: -; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15 -; GCN-NEXT: s_branch .LBB12_14 -; GCN-NEXT: .LBB12_7: -; GCN-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] -; GCN-NEXT: s_cbranch_scc1 .LBB12_15 -; GCN-NEXT: .LBB12_8: -; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 -; GCN-NEXT: s_branch .LBB12_16 -; GCN-NEXT: .LBB12_9: -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_branch .LBB12_17 -; GCN-NEXT: .LBB12_10: ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_branch .LBB12_2 -; GCN-NEXT: .LBB12_11: +; GCN-NEXT: .LBB12_7: ; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_branch .LBB12_5 -; GCN-NEXT: .LBB12_12: +; GCN-NEXT: .LBB12_8: ; GCN-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: .LBB12_9: ; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9] -; GCN-NEXT: s_cbranch_scc0 .LBB12_6 -; GCN-NEXT: .LBB12_13: +; GCN-NEXT: s_mov_b32 s12, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_12 +; GCN-NEXT: ; %bb.10: ; GCN-NEXT: s_ashr_i32 s12, s9, 31 ; GCN-NEXT: s_add_u32 s14, s8, s12 ; GCN-NEXT: s_mov_b32 s13, s12 @@ -5322,8 +5323,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[16:17] ; GCN-NEXT: s_sub_u32 s14, s14, s16 ; GCN-NEXT: s_subb_u32 s15, s15, s16 -; GCN-NEXT: s_cbranch_execnz .LBB12_7 -; GCN-NEXT: .LBB12_14: +; GCN-NEXT: s_cbranch_execnz .LBB12_13 +; GCN-NEXT: .LBB12_11: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s9, 0, s8 ; GCN-NEXT: v_mov_b32_e32 v5, 0 @@ -5342,9 +5343,19 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s8, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GCN-NEXT: s_branch .LBB12_14 +; GCN-NEXT: .LBB12_12: +; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GCN-NEXT: s_branch .LBB12_11 +; GCN-NEXT: .LBB12_13: +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: .LBB12_14: ; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] -; GCN-NEXT: s_cbranch_scc0 .LBB12_8 -; GCN-NEXT: .LBB12_15: +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_17 +; GCN-NEXT: ; %bb.15: ; GCN-NEXT: s_ashr_i32 s8, s3, 31 ; GCN-NEXT: s_add_u32 s10, s2, s8 ; GCN-NEXT: s_mov_b32 s9, s8 @@ -5467,7 +5478,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] ; GCN-NEXT: s_sub_u32 s10, s10, s12 ; GCN-NEXT: s_subb_u32 s11, s11, s12 -; GCN-NEXT: s_cbranch_execnz .LBB12_9 +; GCN-NEXT: s_cbranch_execnz .LBB12_18 ; GCN-NEXT: .LBB12_16: ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_sub_i32 s3, 0, s2 @@ -5487,7 +5498,14 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GCN-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GCN-NEXT: s_branch .LBB12_19 ; GCN-NEXT: .LBB12_17: +; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GCN-NEXT: s_branch .LBB12_16 +; GCN-NEXT: .LBB12_18: +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: .LBB12_19: ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 @@ -6101,23 +6119,23 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: v_mov_b32_e32 v0, s6 ; TONGA-NEXT: s_add_u32 s0, s6, 48 -; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: v_mov_b32_e32 v0, s6 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: s_add_u32 s2, s6, 32 ; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; TONGA-NEXT: s_addc_u32 s3, s7, 0 +; TONGA-NEXT: v_mov_b32_e32 v0, s2 +; TONGA-NEXT: v_mov_b32_e32 v1, s3 +; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 -; TONGA-NEXT: s_add_u32 s0, s6, 32 -; TONGA-NEXT: s_addc_u32 s1, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v3, s1 -; TONGA-NEXT: v_mov_b32_e32 v2, s0 ; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 -; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v4, s0 +; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_readfirstlane_b32 s3, v15 @@ -6126,6 +6144,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_readfirstlane_b32 s1, v11 ; TONGA-NEXT: v_readfirstlane_b32 s0, v10 ; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 ; TONGA-NEXT: s_cbranch_scc0 .LBB12_3 ; TONGA-NEXT: ; %bb.1: ; TONGA-NEXT: s_ashr_i32 s6, s1, 31 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 51aa8706abac2..28c6b40554bb6 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -731,11 +731,12 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX1032-NEXT: s_mov_b32 s8, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 @@ -750,71 +751,71 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s5, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s4 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s5 -; GFX1032-NEXT: s_mul_i32 s12, s10, s5 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1032-NEXT: s_mul_i32 s11, s9, s5 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s10, s8 ; GFX1032-NEXT: s_add_i32 s11, s13, s11 -; GFX1032-NEXT: s_mul_i32 s14, s9, s5 +; GFX1032-NEXT: s_mul_i32 s14, s9, s8 ; GFX1032-NEXT: s_add_i32 s11, s11, s12 -; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s14 -; GFX1032-NEXT: s_mul_i32 s16, s5, s11 -; GFX1032-NEXT: s_mul_hi_u32 s15, s4, s14 -; GFX1032-NEXT: s_mul_i32 s12, s4, s14 -; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s11 +; GFX1032-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1032-NEXT: s_mul_i32 s16, s8, s11 +; GFX1032-NEXT: s_mul_hi_u32 s15, s5, s14 +; GFX1032-NEXT: s_mul_i32 s12, s5, s14 +; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s11 ; GFX1032-NEXT: s_add_u32 s13, s13, s16 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s17, s4, s11 +; GFX1032-NEXT: s_mul_hi_u32 s17, s5, s11 ; GFX1032-NEXT: s_add_u32 s12, s13, s12 -; GFX1032-NEXT: s_mul_i32 s11, s4, s11 +; GFX1032-NEXT: s_mul_i32 s11, s5, s11 ; GFX1032-NEXT: s_addc_u32 s12, s14, s15 ; GFX1032-NEXT: s_addc_u32 s13, s17, 0 ; GFX1032-NEXT: s_add_u32 s11, s12, s11 ; GFX1032-NEXT: s_addc_u32 s12, 0, s13 -; GFX1032-NEXT: s_add_u32 s5, s5, s11 -; GFX1032-NEXT: s_addc_u32 s4, s4, s12 -; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s5 -; GFX1032-NEXT: s_mul_i32 s12, s9, s5 -; GFX1032-NEXT: s_mul_i32 s9, s9, s4 -; GFX1032-NEXT: s_mul_i32 s10, s10, s5 +; GFX1032-NEXT: s_add_u32 s8, s8, s11 +; GFX1032-NEXT: s_addc_u32 s5, s5, s12 +; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s9, s8 +; GFX1032-NEXT: s_mul_i32 s9, s9, s5 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 ; GFX1032-NEXT: s_add_i32 s9, s11, s9 -; GFX1032-NEXT: s_mul_i32 s11, s4, s12 +; GFX1032-NEXT: s_mul_i32 s11, s5, s12 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_hi_u32 s10, s5, s12 -; GFX1032-NEXT: s_mul_i32 s15, s5, s9 -; GFX1032-NEXT: s_mul_hi_u32 s14, s5, s9 +; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX1032-NEXT: s_mul_i32 s15, s8, s9 +; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 ; GFX1032-NEXT: s_add_u32 s10, s10, s15 -; GFX1032-NEXT: s_mul_hi_u32 s13, s4, s12 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12 ; GFX1032-NEXT: s_addc_u32 s14, 0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s12, s4, s9 +; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9 ; GFX1032-NEXT: s_add_u32 s10, s10, s11 -; GFX1032-NEXT: s_mul_i32 s9, s4, s9 +; GFX1032-NEXT: s_mul_i32 s9, s5, s9 ; GFX1032-NEXT: s_addc_u32 s10, s14, s13 ; GFX1032-NEXT: s_addc_u32 s11, s12, 0 ; GFX1032-NEXT: s_add_u32 s9, s10, s9 ; GFX1032-NEXT: s_addc_u32 s10, 0, s11 -; GFX1032-NEXT: s_add_u32 s5, s5, s9 -; GFX1032-NEXT: s_addc_u32 s4, s4, s10 -; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s5 -; GFX1032-NEXT: s_mul_i32 s12, s2, s4 -; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s4 -; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s5 -; GFX1032-NEXT: s_mul_i32 s5, s3, s5 +; GFX1032-NEXT: s_add_u32 s8, s8, s9 +; GFX1032-NEXT: s_addc_u32 s5, s5, s10 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX1032-NEXT: s_mul_i32 s12, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 ; GFX1032-NEXT: s_add_u32 s9, s9, s12 ; GFX1032-NEXT: s_addc_u32 s11, 0, s11 -; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s4 -; GFX1032-NEXT: s_add_u32 s5, s9, s5 -; GFX1032-NEXT: s_mul_i32 s4, s3, s4 -; GFX1032-NEXT: s_addc_u32 s5, s11, s10 +; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 +; GFX1032-NEXT: s_add_u32 s8, s9, s8 +; GFX1032-NEXT: s_mul_i32 s5, s3, s5 +; GFX1032-NEXT: s_addc_u32 s8, s11, s10 ; GFX1032-NEXT: s_addc_u32 s9, s13, 0 -; GFX1032-NEXT: s_add_u32 s4, s5, s4 -; GFX1032-NEXT: s_addc_u32 s5, 0, s9 -; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s4 -; GFX1032-NEXT: s_mul_i32 s10, s0, s5 -; GFX1032-NEXT: s_mul_i32 s11, s1, s4 +; GFX1032-NEXT: s_add_u32 s5, s8, s5 +; GFX1032-NEXT: s_addc_u32 s8, 0, s9 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX1032-NEXT: s_mul_i32 s10, s0, s8 +; GFX1032-NEXT: s_mul_i32 s11, s1, s5 ; GFX1032-NEXT: s_add_i32 s9, s9, s10 -; GFX1032-NEXT: s_mul_i32 s10, s0, s4 +; GFX1032-NEXT: s_mul_i32 s10, s0, s5 ; GFX1032-NEXT: s_add_i32 s9, s9, s11 ; GFX1032-NEXT: s_sub_i32 s11, s3, s9 ; GFX1032-NEXT: s_sub_u32 s10, s2, s10 @@ -828,10 +829,10 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1032-NEXT: s_cmp_eq_u32 s11, s1 ; GFX1032-NEXT: s_cselect_b32 s11, s13, s14 -; GFX1032-NEXT: s_add_u32 s13, s4, 1 -; GFX1032-NEXT: s_addc_u32 s14, s5, 0 -; GFX1032-NEXT: s_add_u32 s15, s4, 2 -; GFX1032-NEXT: s_addc_u32 s16, s5, 0 +; GFX1032-NEXT: s_add_u32 s13, s5, 1 +; GFX1032-NEXT: s_addc_u32 s14, s8, 0 +; GFX1032-NEXT: s_add_u32 s15, s5, 2 +; GFX1032-NEXT: s_addc_u32 s16, s8, 0 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 ; GFX1032-NEXT: s_cselect_b32 s11, s15, s13 ; GFX1032-NEXT: s_cselect_b32 s13, s16, s14 @@ -844,14 +845,14 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_eq_u32 s3, s1 ; GFX1032-NEXT: s_cselect_b32 s1, s10, s9 ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_cselect_b32 s5, s13, s5 -; GFX1032-NEXT: s_cselect_b32 s4, s11, s4 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1032-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1032-NEXT: s_cselect_b32 s8, s11, s5 +; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX1032-NEXT: .LBB15_2: ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX1032-NEXT: s_sub_i32 s3, 0, s0 -; GFX1032-NEXT: s_mov_b32 s5, 0 +; GFX1032-NEXT: s_mov_b32 s9, 0 ; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -869,15 +870,15 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cselect_b32 s2, s4, s2 ; GFX1032-NEXT: s_add_i32 s3, s1, 1 ; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 -; GFX1032-NEXT: s_cselect_b32 s4, s3, s1 +; GFX1032-NEXT: s_cselect_b32 s8, s3, s1 ; GFX1032-NEXT: .LBB15_3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB15_4: -; GFX1032-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX1032-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1032-NEXT: s_branch .LBB15_2 ; ; GFX1064-LABEL: test_udiv64: @@ -887,6 +888,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 From 9c017a93224fc67f8c3fc784373fac3441d5004b Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Mon, 10 Nov 2025 18:21:59 -0500 Subject: [PATCH 7/7] clang-format-diff --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 1c4177404f64c..b1f8fdc210aff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1335,7 +1335,7 @@ SIInstrInfo::pierceThroughRegSequence(const MachineInstr &MI) const { MRI.getRegClass(RealDefs[(I + 1) % 2]->getOperand(0).getReg()) ->MC->getSizeInBits() * 2 == - MRI.getRegClass(MI.getOperand(0).getReg())->MC->getSizeInBits()) + MRI.getRegClass(MI.getOperand(0).getReg())->MC->getSizeInBits()) return RealDefs[(I + 1) % 2]; return nullptr;