diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 7bbad172b2d42..43405787a0139 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3452,11 +3452,6 @@ class LLVM_ABI TargetLoweringBase { if (isOperationLegal(Opcode, VT)) return true; - // TODO: The default logic is inherited from code in CodeGenPrepare. - // The opcode should not make a difference by default? - if (Opcode != ISD::UADDO) - return false; - // Allow the transform as long as we have an integer type that is not // obviously illegal and unsupported and if the math result is used // besides the overflow check. On some targets (e.g. SPARC), it is @@ -3464,7 +3459,7 @@ class LLVM_ABI TargetLoweringBase { // concrete users. if (VT.isVector()) return false; - return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); + return MathUsed && (isTypeLegal(VT) || !isOperationExpand(Opcode, VT)); } // Return true if it is profitable to use a scalar input to a BUILD_VECTOR diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bdaf48652d107..7e03c4f8af37d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -295,6 +295,15 @@ class AMDGPUTargetLowering : public TargetLowering { bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; } + + bool shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool MathUsed) const override { + if (isOperationLegal(Opcode, VT)) + return true; + + return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); + } + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index fa130a153b0de..5417d0ac839a9 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -641,7 +641,6 @@ class VectorType; bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override { - // Using overflow ops for overflow checks only should beneficial on ARM. return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34854e4d8b6c0..4844a81b8bee2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3405,10 +3405,7 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, bool) const { - // TODO: Allow vectors? 
- if (VT.isVector()) - return false; - return VT.isSimple() || !isOperationExpand(Opcode, VT); + return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index 269cbf03f32a0..606162ade272b 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -355,7 +355,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: abd_cmp_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x1 -; CHECK-NEXT: cneg x0, x8, hs +; CHECK-NEXT: cneg x0, x8, hi ; CHECK-NEXT: ret %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll index b58f6ba96a5b8..3f4d6f722fdb6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll +++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll @@ -9,13 +9,12 @@ define i32 @srl_and() { ; CHECK-LABEL: srl_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:g -; CHECK-NEXT: mov w9, #50 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:g] ; CHECK-NEXT: ldrh w8, [x8] -; CHECK-NEXT: eor w8, w8, w9 -; CHECK-NEXT: mov w9, #65535 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: and w0, w8, w8, lsr #16 +; CHECK-NEXT: cmp w8, #50 +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: cset w9, ne +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret entry: %0 = load i16, ptr @g, align 4 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll index 66fea3535b1ec..86d8c13811d71 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll @@ -113,10 +113,12 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; CHECK-NEXT: .LBB6_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldaxr w8, [x0] +; CHECK-NEXT: subs w9, w8, #1 +; CHECK-NEXT: cset w10, lo ; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: sub w9, w8, #1 -; CHECK-NEXT: ccmp w8, #0, #4, ls -; CHECK-NEXT: csel w9, w1, w9, eq +; CHECK-NEXT: csinc w10, w10, wzr, ls +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: csel w9, w1, w9, ne ; CHECK-NEXT: stlxr w10, w9, [x0] ; CHECK-NEXT: cbnz w10, .LBB6_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end @@ -133,10 +135,12 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: .LBB7_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldaxr x0, [x8] +; CHECK-NEXT: subs x9, x0, #1 +; CHECK-NEXT: cset w10, lo ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: sub x9, x0, #1 -; CHECK-NEXT: ccmp x0, #0, #4, ls -; CHECK-NEXT: csel x9, x1, x9, eq +; CHECK-NEXT: csinc w10, w10, wzr, ls +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: csel x9, x1, x9, ne ; CHECK-NEXT: stlxr w10, x9, [x8] ; CHECK-NEXT: cbnz w10, .LBB7_1 ; CHECK-NEXT: // %bb.2: // %atomicrmw.end diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll index d307107fc07ee..e49e8e86561c7 100644 --- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll +++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll @@ -108,11 +108,9 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) nounwind { define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind { ; CHECK-LABEL: usubo_eq_constant1_op1_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: sub w9, w0, #1 -; CHECK-NEXT: cset w8, eq -; CHECK-NEXT: str w9, [x1] -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: subs 
w8, w0, #1 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret %s = add i32 %x, -1 %ov = icmp eq i32 %x, 0 diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll index 3f4dd116d91f8..7917be5728591 100644 --- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -192,12 +192,12 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) { ; CHECK-NEXT: mov w22, #2 ; =0x2 ; CHECK-NEXT: LBB3_5: ; %for.cond ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: cbz w22, LBB3_8 +; CHECK-NEXT: subs w22, w22, #1 +; CHECK-NEXT: b.lo LBB3_8 ; CHECK-NEXT: ; %bb.6: ; %for.body ; CHECK-NEXT: ; in Loop: Header=BB3_5 Depth=1 -; CHECK-NEXT: sub w22, w22, #1 -; CHECK-NEXT: orr w9, w21, w20 ; CHECK-NEXT: ldr w10, [x19, w22, sxtw #2] +; CHECK-NEXT: orr w9, w21, w20 ; CHECK-NEXT: cmp w9, w10 ; CHECK-NEXT: b.eq LBB3_5 ; CHECK-NEXT: ; %bb.7: ; %if.then @@ -238,12 +238,12 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) { ; OUTLINE-ATOMICS-NEXT: cset w8, eq ; OUTLINE-ATOMICS-NEXT: LBB3_1: ; %for.cond ; OUTLINE-ATOMICS-NEXT: ; =>This Inner Loop Header: Depth=1 -; OUTLINE-ATOMICS-NEXT: cbz w22, LBB3_4 +; OUTLINE-ATOMICS-NEXT: subs w22, w22, #1 +; OUTLINE-ATOMICS-NEXT: b.lo LBB3_4 ; OUTLINE-ATOMICS-NEXT: ; %bb.2: ; %for.body ; OUTLINE-ATOMICS-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; OUTLINE-ATOMICS-NEXT: sub w22, w22, #1 -; OUTLINE-ATOMICS-NEXT: orr w9, w21, w20 ; OUTLINE-ATOMICS-NEXT: ldr w10, [x19, w22, sxtw #2] +; OUTLINE-ATOMICS-NEXT: orr w9, w21, w20 ; OUTLINE-ATOMICS-NEXT: cmp w9, w10 ; OUTLINE-ATOMICS-NEXT: b.eq LBB3_1 ; OUTLINE-ATOMICS-NEXT: ; %bb.3: ; %if.then diff --git a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll index 1207eaa2612a3..f2c84006910c5 100644 --- a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll +++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll @@ -17,24 +17,22 @@ define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 { ; CHECK-ASM-NEXT: .cfi_remember_state ; CHECK-ASM-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-ASM-NEXT: sxtw x8, w0 +; CHECK-ASM-NEXT: mov w9, #10 // =0xa ; CHECK-ASM-NEXT: stp w1, w0, [sp, #8] -; CHECK-ASM-NEXT: cmp x8, #10 -; CHECK-ASM-NEXT: b.hi .LBB0_5 +; CHECK-ASM-NEXT: subs x9, x9, x8 +; CHECK-ASM-NEXT: b.lo .LBB0_5 ; CHECK-ASM-NEXT: // %bb.1: // %entry -; CHECK-ASM-NEXT: mov w9, #10 // =0xa -; CHECK-ASM-NEXT: sub x9, x9, x8 ; CHECK-ASM-NEXT: cbz x9, .LBB0_5 ; CHECK-ASM-NEXT: // %bb.2: ; CHECK-ASM-NEXT: ldrsw x9, [sp, #8] +; CHECK-ASM-NEXT: mov w10, #10 // =0xa +; CHECK-ASM-NEXT: subs x11, x10, x9 ; CHECK-ASM-NEXT: adrp x10, .L_MergedGlobals ; CHECK-ASM-NEXT: add x10, x10, :lo12:.L_MergedGlobals ; CHECK-ASM-NEXT: strb wzr, [x10, x8] -; CHECK-ASM-NEXT: cmp x9, #10 -; CHECK-ASM-NEXT: b.hi .LBB0_6 +; CHECK-ASM-NEXT: b.lo .LBB0_6 ; CHECK-ASM-NEXT: // %bb.3: -; CHECK-ASM-NEXT: mov w8, #10 // =0xa -; CHECK-ASM-NEXT: sub x8, x8, x9 -; CHECK-ASM-NEXT: cbz x8, .LBB0_6 +; CHECK-ASM-NEXT: cbz x11, .LBB0_6 ; CHECK-ASM-NEXT: // %bb.4: ; CHECK-ASM-NEXT: add x8, x10, x9 ; CHECK-ASM-NEXT: strb wzr, [x8, #10] diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index ecd48d6b7c65b..12044ebe20fa1 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -25,9 +25,9 @@ define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) { ; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum: ; CHECK: // 
%bb.0: ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x100 -; CHECK-NEXT: csinv w0, w8, wzr, eq +; CHECK-NEXT: add w9, w0, #42 +; CHECK-NEXT: cmp w8, w9, uxtb +; CHECK-NEXT: csinv w0, w9, wzr, ls ; CHECK-NEXT: ret %a = add i8 %x, 42 %c = icmp ugt i8 %x, %a @@ -68,9 +68,9 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x10000 -; CHECK-NEXT: csinv w0, w8, wzr, eq +; CHECK-NEXT: add w9, w0, #42 +; CHECK-NEXT: cmp w8, w9, uxth +; CHECK-NEXT: csinv w0, w9, wzr, ls ; CHECK-NEXT: ret %a = add i16 %x, 42 %c = icmp ugt i16 %x, %a @@ -188,9 +188,9 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) { ; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: add w8, w8, w1, uxtb -; CHECK-NEXT: tst w8, #0x100 -; CHECK-NEXT: csinv w0, w8, wzr, eq +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: cmp w8, w9, uxtb +; CHECK-NEXT: csinv w0, w9, wzr, ls ; CHECK-NEXT: ret %a = add i8 %x, %y %c = icmp ugt i8 %x, %a @@ -201,11 +201,11 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) { define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) { ; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xff -; CHECK-NEXT: add w9, w0, w1 -; CHECK-NEXT: add w8, w8, w0, uxtb -; CHECK-NEXT: tst w8, #0x100 -; CHECK-NEXT: csinv w0, w9, wzr, eq +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mvn w9, w1 +; CHECK-NEXT: add w10, w0, w1 +; CHECK-NEXT: cmp w8, w9, uxtb +; CHECK-NEXT: csinv w0, w10, wzr, ls ; CHECK-NEXT: ret %noty = xor i8 %y, -1 %a = add i8 %x, %y @@ -234,9 +234,9 @@ define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) { ; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: add w8, w8, w1, uxth -; CHECK-NEXT: tst w8, #0x10000 -; CHECK-NEXT: csinv w0, w8, wzr, eq +; CHECK-NEXT: add w9, w0, w1 +; CHECK-NEXT: cmp w8, w9, uxth +; CHECK-NEXT: csinv w0, w9, wzr, ls ; CHECK-NEXT: ret %a = add i16 %x, %y %c = icmp ugt i16 %x, %a @@ -247,11 +247,11 @@ define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) { define i16 @unsigned_sat_variable_i16_using_cmp_notval(i16 %x, i16 %y) { ; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xffff -; CHECK-NEXT: add w9, w0, w1 -; CHECK-NEXT: add w8, w8, w0, uxth -; CHECK-NEXT: tst w8, #0x10000 -; CHECK-NEXT: csinv w0, w9, wzr, eq +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: mvn w9, w1 +; CHECK-NEXT: add w10, w0, w1 +; CHECK-NEXT: cmp w8, w9, uxth +; CHECK-NEXT: csinv w0, w10, wzr, ls ; CHECK-NEXT: ret %noty = xor i16 %y, -1 %a = add i16 %x, %y diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll index 7c80f9320faec..0720a7f72bd8c 100644 --- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll +++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll @@ -313,9 +313,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind { define i1 @add_ultcmp_bad_i8_i16(i16 %x) nounwind { ; CHECK-LABEL: add_ultcmp_bad_i8_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: add w8, w8, #128 -; CHECK-NEXT: lsr w0, w8, #16 +; CHECK-NEXT: add w8, w0, #128 +; CHECK-NEXT: tst w8, 
#0xff80 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i16 %x, 128 ; 1U << (8-1) %tmp1 = icmp ult i16 %tmp0, 128 ; 1U << (8-1) diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 7cc505171da82..869c1f82e976b 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -8287,22 +8287,22 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB109_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, -1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: s_or_b64 vcc, s[4:5], vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART @@ -8343,10 +8343,10 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 -; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[4:5], v[2:3] +; GFX950-NEXT: s_or_b64 vcc, s[0:1], vcc ; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc @@ -8391,19 +8391,19 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX90A-NEXT: s_cbranch_execz .LBB110_4 ; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 -; 
GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GFX90A-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB110_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8440,10 +8440,10 @@ define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 -; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[4:5], v[0:1] +; GFX950-NEXT: s_or_b64 vcc, s[0:1], vcc ; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off @@ -16607,23 +16607,23 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, -1, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: s_or_b64 vcc, s[4:5], vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; 
GFX90A-NEXT: ;;#ASMEND @@ -16661,16 +16661,16 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 -; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[4:5], v[2:3] +; GFX950-NEXT: s_or_b64 vcc, s[0:1], vcc ; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 -; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX950-NEXT: .LBB217_4: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -16708,20 +16708,20 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, -1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GFX90A-NEXT: s_or_b64 vcc, s[4:5], vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.phi ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use v[2:3] @@ -16738,33 +16738,33 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB218_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_cbranch_execz .LBB218_3 ; GFX950-NEXT: s_branch .LBB218_4 ; GFX950-NEXT: .LBB218_2: -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; 
GFX950-NEXT: .LBB218_3: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s2, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 -; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2 -; GFX950-NEXT: .LBB218_4: ; %atomicrmw.end +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[4:5], v[2:3] +; GFX950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 +; GFX950-NEXT: .LBB218_4: ; %atomicrmw.phi ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 9e27f6badfdac..3cc8b60c41389 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -23477,28 +23477,28 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB141_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: ; %bb.1: ; %Flow5 +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB141_6 ; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: s_mov_b64 s[8:9], 0 ; GCN1-NEXT: .LBB141_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 -; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23506,33 +23506,34 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 
v7, v5 -; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_cbranch_execnz .LBB141_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execz .LBB141_2 ; GCN1-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -23544,28 +23545,28 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB141_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: ; %bb.1: ; %Flow5 +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB141_6 ; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: .LBB141_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 -; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; 
GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23573,33 +23574,34 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_cbranch_execnz .LBB141_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execz .LBB141_2 ; GCN2-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -23609,25 +23611,25 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB141_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: ; %bb.1: ; %Flow5 +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB141_6 ; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: 
s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 ; GCN3-NEXT: .LBB141_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 -; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v7, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23635,32 +23637,33 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_cbranch_execnz .LBB141_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execz .LBB141_2 ; GCN3-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -23678,28 +23681,28 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: 
v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB142_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: ; %bb.1: ; %Flow5 +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB142_6 ; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: s_mov_b64 s[8:9], 0 ; GCN1-NEXT: .LBB142_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 -; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23707,33 +23710,34 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_cbranch_execnz .LBB142_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execz .LBB142_2 ; GCN1-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: 
v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -23747,28 +23751,28 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB142_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: ; %bb.1: ; %Flow5 +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB142_6 ; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: .LBB142_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 -; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23776,33 +23780,34 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_cbranch_execnz .LBB142_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execz .LBB142_2 ; GCN2-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword 
v1, v7, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -23814,25 +23819,25 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB142_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: ; %bb.1: ; %Flow5 +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB142_6 ; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 ; GCN3-NEXT: .LBB142_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 -; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v7, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -23840,32 +23845,33 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_cbranch_execnz .LBB142_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: 
s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execz .LBB142_2 ; GCN3-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -23883,59 +23889,59 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB143_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: s_mov_b64 s[8:9], 0 ; GCN1-NEXT: .LBB143_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 -; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_cbranch_execnz .LBB143_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 
exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: .LBB143_4: ; %Flow3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: .LBB143_4: ; %Flow5 +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execz .LBB143_6 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[4:5] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GCN1-NEXT: .LBB143_6: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -23950,59 +23956,59 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB143_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: .LBB143_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 -; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_cbranch_execnz .LBB143_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: .LBB143_4: ; %Flow3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: .LBB143_4: ; %Flow5 +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execz .LBB143_6 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v4 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[4:5] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; GCN2-NEXT: .LBB143_6: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -24017,59 +24023,59 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB143_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: ; %bb.1: ; %Flow5 +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB143_6 ; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 ; GCN3-NEXT: .LBB143_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v9, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_cbranch_execnz .LBB143_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execz .LBB143_2 ; GCN3-NEXT: .LBB143_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -24088,63 +24094,63 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB144_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: ; %bb.1: ; %Flow5 +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execnz .LBB144_6 ; 
GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: s_mov_b64 s[8:9], 0 ; GCN1-NEXT: .LBB144_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_cbranch_execnz .LBB144_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN1-NEXT: s_cbranch_execz .LBB144_2 ; GCN1-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -24159,63 +24165,63 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB144_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: ; %bb.1: ; %Flow5 +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execnz .LBB144_6 ; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: s_mov_b64 s[8:9], 0 ; GCN2-NEXT: .LBB144_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_cbranch_execnz .LBB144_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN2-NEXT: s_cbranch_execz .LBB144_2 ; GCN2-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; 
GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -24228,59 +24234,59 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB144_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: ; %bb.1: ; %Flow5 +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execnz .LBB144_6 ; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 ; GCN3-NEXT: .LBB144_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v9, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_cbranch_execnz .LBB144_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; GCN3-NEXT: s_cbranch_execz .LBB144_2 ; GCN3-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GCN3-NEXT: 
s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -24300,7 +24306,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow5 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi @@ -24314,17 +24320,17 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24332,34 +24338,35 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB145_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_branch .LBB145_2 ; GCN1-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: 
v_mov_b32_e32 v6, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s34 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -24374,7 +24381,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow5 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi @@ -24388,17 +24395,17 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24406,33 +24413,34 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; 
GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB145_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_branch .LBB145_2 ; GCN2-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s34 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -24445,7 +24453,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow5 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi @@ -24454,17 +24462,17 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], 
v[0:3] glc @@ -24472,31 +24480,32 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB145_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_branch .LBB145_2 ; GCN3-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v5, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -24509,39 +24518,39 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s38, s4, 32 -; GCN1-NEXT: s_addc_u32 s39, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 32 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cmp_eq_u32 s37, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: ; %bb.1: ; %Flow5 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN1-NEXT: s_add_u32 s34, s38, 4 -; GCN1-NEXT: s_addc_u32 s35, s39, 0 +; GCN1-NEXT: s_add_u32 s34, s36, 4 +; GCN1-NEXT: s_addc_u32 s35, s37, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v4, s38 -; GCN1-NEXT: v_mov_b32_e32 v5, 
s39 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: v_mov_b32_e32 v5, s37 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] -; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24549,34 +24558,35 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN1-NEXT: v_mov_b32_e32 v3, v1 -; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB146_4 ; GCN1-NEXT: ; %bb.5: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_branch .LBB146_2 ; GCN1-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GCN1-NEXT: s_cselect_b32 s34, s38, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_cselect_b32 s34, s36, -1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s34 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(0) 
; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -24585,39 +24595,39 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s38, s4, 32 -; GCN2-NEXT: s_addc_u32 s39, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 32 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cmp_eq_u32 s37, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: ; %bb.1: ; %Flow5 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN2-NEXT: s_add_u32 s34, s38, 4 -; GCN2-NEXT: s_addc_u32 s35, s39, 0 +; GCN2-NEXT: s_add_u32 s34, s36, 4 +; GCN2-NEXT: s_addc_u32 s35, s37, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v4, s38 -; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: v_mov_b32_e32 v5, s37 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] -; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24625,67 +24635,68 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN2-NEXT: v_mov_b32_e32 v3, v1 -; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB146_4 ; GCN2-NEXT: ; %bb.5: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_branch .LBB146_2 ; GCN2-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN2-NEXT: s_cselect_b32 s34, s38, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN2-NEXT: s_cselect_b32 s34, s36, -1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; 
GCN2-NEXT: v_mov_b32_e32 v5, s34 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_add_u32 s36, s4, 32 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base -; GCN3-NEXT: s_addc_u32 s39, s5, 0 -; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_addc_u32 s37, s5, 0 +; GCN3-NEXT: s_cmp_eq_u32 s37, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: ; %bb.1: ; %Flow5 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v4, s38 -; GCN3-NEXT: v_mov_b32_e32 v5, s39 +; GCN3-NEXT: v_mov_b32_e32 v4, s36 +; GCN3-NEXT: v_mov_b32_e32 v5, s37 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] -; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: s_mov_b64 s[38:39], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc @@ -24693,31 +24704,32 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GCN3-NEXT: v_mov_b32_e32 v3, v1 -; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, 
s[40:41] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_cbranch_execnz .LBB146_4 ; GCN3-NEXT: ; %bb.5: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_branch .LBB146_2 ; GCN3-NEXT: .LBB146_6: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN3-NEXT: s_cselect_b32 s34, s38, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN3-NEXT: s_cselect_b32 s34, s36, -1 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v5, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -24745,7 +24757,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start @@ -24753,48 +24765,48 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, 
s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_cbranch_execnz .LBB147_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_branch .LBB147_6 ; GCN1-NEXT: .LBB147_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_cbranch_execz .LBB147_6 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s34 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -24818,7 +24830,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start @@ -24826,22 +24838,22 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_cbranch_execnz .LBB147_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_branch .LBB147_6 ; GCN2-NEXT: .LBB147_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -24849,24 +24861,24 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s34 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -24883,7 +24895,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB147_2: ; %atomicrmw.start @@ -24891,22 +24903,22 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v9, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN3-NEXT: 
v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_cbranch_execnz .LBB147_2 ; GCN3-NEXT: ; %bb.3: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_branch .LBB147_6 ; GCN3-NEXT: .LBB147_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -24914,22 +24926,22 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v5, s7 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -24943,23 +24955,23 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s38, s4, 32 -; GCN1-NEXT: s_addc_u32 s39, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 32 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cmp_eq_u32 s37, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_cbranch_vccz .LBB148_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: s_add_u32 s34, s38, 4 -; GCN1-NEXT: s_addc_u32 s35, s39, 0 +; GCN1-NEXT: s_add_u32 s34, s36, 4 +; GCN1-NEXT: s_addc_u32 s35, s37, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s38 -; GCN1-NEXT: v_mov_b32_e32 v3, s39 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 +; GCN1-NEXT: v_mov_b32_e32 v3, s37 ; GCN1-NEXT: flat_load_dword v1, 
v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] -; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: s_mov_b64 s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start @@ -24967,48 +24979,48 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_cbranch_execnz .LBB148_2 ; GCN1-NEXT: ; %bb.3: ; %Flow -; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_branch .LBB148_6 ; GCN1-NEXT: .LBB148_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_cbranch_execz .LBB148_6 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 -; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec -; GCN1-NEXT: s_cselect_b32 s34, s38, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_cselect_b32 s34, s36, -1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s34 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; 
GCN1-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -25018,23 +25030,23 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s38, s4, 32 -; GCN2-NEXT: s_addc_u32 s39, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 32 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cmp_eq_u32 s37, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_cbranch_vccz .LBB148_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: s_add_u32 s34, s38, 4 -; GCN2-NEXT: s_addc_u32 s35, s39, 0 +; GCN2-NEXT: s_add_u32 s34, s36, 4 +; GCN2-NEXT: s_addc_u32 s35, s37, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s38 -; GCN2-NEXT: v_mov_b32_e32 v3, s39 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 +; GCN2-NEXT: v_mov_b32_e32 v3, s37 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] -; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: s_mov_b64 s[38:39], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start @@ -25042,47 +25054,47 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_cbranch_execnz .LBB148_2 ; GCN2-NEXT: ; %bb.3: ; %Flow -; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_branch .LBB148_6 ; GCN2-NEXT: .LBB148_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_cbranch_execz .LBB148_6 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN2-NEXT: s_cselect_b32 s34, s38, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN2-NEXT: s_cselect_b32 s34, s36, -1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s34 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; 
GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -25090,18 +25102,18 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_add_u32 s36, s4, 32 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base -; GCN3-NEXT: s_addc_u32 s39, s5, 0 -; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_addc_u32 s37, s5, 0 +; GCN3-NEXT: s_cmp_eq_u32 s37, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_cbranch_vccz .LBB148_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v2, s38 -; GCN3-NEXT: v_mov_b32_e32 v3, s39 +; GCN3-NEXT: v_mov_b32_e32 v2, s36 +; GCN3-NEXT: v_mov_b32_e32 v3, s37 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] -; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: s_mov_b64 s[38:39], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start @@ -25109,45 +25121,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v9, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc ; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_cbranch_execnz .LBB148_2 ; GCN3-NEXT: ; %bb.3: ; %Flow -; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: 
s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_branch .LBB148_6 ; GCN3-NEXT: .LBB148_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_cbranch_execz .LBB148_6 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 -; GCN3-NEXT: s_cselect_b32 s34, s38, -1 -; GCN3-NEXT: v_mov_b32_e32 v2, s34 -; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: v_mov_b32_e32 v4, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_cmp_lg_u64 s[36:37], 0 +; GCN3-NEXT: s_cselect_b32 s34, s36, -1 +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v5, s7 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[34:35], v[2:3], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[34:35], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -25170,10 +25182,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB149_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB149_4 ; GCN1-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -25181,25 +25193,26 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB149_2 ; GCN1-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) 
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -25216,10 +25229,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB149_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB149_4 ; GCN2-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -25227,25 +25240,26 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB149_2 ; GCN2-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v7, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_waitcnt 
vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -25260,10 +25274,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB149_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB149_4 ; GCN3-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -25271,24 +25285,25 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB149_2 ; GCN3-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -25311,10 +25326,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB150_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB150_4 ; GCN1-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -25322,26 +25337,26 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: 
s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB150_2 ; GCN1-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GCN1-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN1-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN1-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -25359,10 +25374,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB150_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB150_4 ; GCN2-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -25370,26 +25385,26 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB150_2 ; GCN2-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GCN2-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, 
v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN2-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN2-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -25405,10 +25420,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB150_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB150_4 ; GCN3-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -25416,25 +25431,25 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB150_2 ; GCN3-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 -; GCN3-NEXT: s_waitcnt vmcnt(0) -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, -1, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[0:1] +; GCN3-NEXT: s_or_b64 vcc, s[4:5], vcc +; GCN3-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 diff --git a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll 
index 433fb325a7349..c37afeeea375d 100644 --- a/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/ARM/atomicrmw-uinc-udec-wrap.ll @@ -147,11 +147,11 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; CHECK-NEXT: .LBB6_1: @ %atomicrmw.start ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrex r12, [r0] -; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: subls r3, r12, #1 -; CHECK-NEXT: cmp r12, #0 -; CHECK-NEXT: moveq r3, r1 +; CHECK-NEXT: sub r3, r12, #1 +; CHECK-NEXT: movhi r3, r1 +; CHECK-NEXT: cmp r12, #1 +; CHECK-NEXT: movlo r3, r1 ; CHECK-NEXT: strex r2, r3, [r0] ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: bne .LBB6_1 diff --git a/llvm/test/CodeGen/ARM/select_const.ll b/llvm/test/CodeGen/ARM/select_const.ll index 180daa12e7c52..6d8a7f70754d3 100644 --- a/llvm/test/CodeGen/ARM/select_const.ll +++ b/llvm/test/CodeGen/ARM/select_const.ll @@ -763,46 +763,35 @@ define i64 @opaque_constant2(i1 %cond, i64 %x) { define i64 @func(i64 %arg) { ; ARM-LABEL: func: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: adds r0, r0, #1 -; ARM-NEXT: mov r2, #0 -; ARM-NEXT: adcs r0, r1, #0 +; ARM-NEXT: and r0, r0, r1 ; ARM-NEXT: mov r1, #0 -; ARM-NEXT: adcs r0, r2, #0 -; ARM-NEXT: movne r0, #8 +; ARM-NEXT: cmn r0, #1 +; ARM-NEXT: mov r0, #0 +; ARM-NEXT: moveq r0, #8 ; ARM-NEXT: mov pc, lr ; ; THUMB2-LABEL: func: ; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: ands r0, r1 +; THUMB2-NEXT: movs r1, #0 ; THUMB2-NEXT: adds r0, #1 -; THUMB2-NEXT: mov.w r2, #0 -; THUMB2-NEXT: adcs r0, r1, #0 -; THUMB2-NEXT: mov.w r1, #0 -; THUMB2-NEXT: adcs r0, r2, #0 -; THUMB2-NEXT: it ne -; THUMB2-NEXT: movne r0, #8 +; THUMB2-NEXT: mov.w r0, #0 +; THUMB2-NEXT: it eq +; THUMB2-NEXT: moveq r0, #8 ; THUMB2-NEXT: bx lr ; ; THUMB-LABEL: func: ; THUMB: @ %bb.0: @ %entry -; THUMB-NEXT: .save {r4, lr} -; THUMB-NEXT: push {r4, lr} -; THUMB-NEXT: movs r2, #0 -; THUMB-NEXT: adds r3, r0, #1 -; THUMB-NEXT: mov r12, r1 -; THUMB-NEXT: mov r3, r12 -; THUMB-NEXT: adcs r3, r2 -; THUMB-NEXT: mov r12, r2 -; THUMB-NEXT: mov r3, r12 -; THUMB-NEXT: adcs r3, r2 -; THUMB-NEXT: subs r4, r3, #1 +; THUMB-NEXT: ands r0, r1 +; THUMB-NEXT: movs r1, #0 ; THUMB-NEXT: adds r0, r0, #1 -; THUMB-NEXT: adcs r1, r2 -; THUMB-NEXT: sbcs r3, r4 -; THUMB-NEXT: lsls r0, r3, #3 -; THUMB-NEXT: movs r1, r2 -; THUMB-NEXT: pop {r4} -; THUMB-NEXT: pop {r2} -; THUMB-NEXT: bx r2 +; THUMB-NEXT: beq .LBB26_2 +; THUMB-NEXT: @ %bb.1: @ %entry +; THUMB-NEXT: movs r0, r1 +; THUMB-NEXT: bx lr +; THUMB-NEXT: .LBB26_2: +; THUMB-NEXT: movs r0, #8 +; THUMB-NEXT: bx lr entry: %0 = add i64 %arg, 1 %1 = icmp ult i64 %0, 1 diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 8e673c1bb06ba..55a8f2eb768c8 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -184,53 +184,53 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r0 = and(#24,asl(r0,#3)) -; CHECK-NEXT: r3 = and(r0,#-4) -; CHECK-NEXT: r4 = #255 -; CHECK-NEXT: r5 = and(r1,#255) +; CHECK-NEXT: r2 = and(r0,#-4) +; CHECK-NEXT: r3 = #255 +; CHECK-NEXT: r4 = and(r1,#255) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = asl(r4,r0) +; CHECK-NEXT: r5 = asl(r3,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r6 = sub(#-1,r2) +; CHECK-NEXT: r5 = sub(#-1,r5) ; CHECK-NEXT: } ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: { -; CHECK-NEXT: r7 = memw_locked(r3) +; CHECK-NEXT: r7 = #255 +; CHECK-NEXT: r6 = memw_locked(r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = lsr(r7,r0) -; CHECK-NEXT: r7 = and(r7,r6) +; CHECK-NEXT: r7 &= lsr(r6,r0) +; CHECK-NEXT: r8 = and(r6,r5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p0 = bitsclr(r2,r4) -; CHECK-NEXT: r8 = and(r2,#255) +; CHECK-NEXT: p0 = cmp.gtu(r7,r4) +; CHECK-NEXT: r7 = add(r7,#-1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p1 = cmp.gtu(r8,r5) -; CHECK-NEXT: if (p1.new) r8 = add(r1,#0) -; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1) +; CHECK-NEXT: p1 = !bitsset(r3,r7) +; CHECK-NEXT: r9 = mux(p0,r1,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) r8 = add(r1,#0) +; CHECK-NEXT: if (p1) r9 = add(r1,#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r8 = and(r8,#255) +; CHECK-NEXT: r7 = and(r9,#255) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 |= asl(r8,r0) +; CHECK-NEXT: r8 |= asl(r7,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw_locked(r3,p0) = r7 +; CHECK-NEXT: memw_locked(r2,p0) = r8 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: if (!p0) jump:nt .LBB4_1 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.2: // %atomicrmw.end ; CHECK-NEXT: { -; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: r0 = lsr(r6,r0) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst @@ -243,53 +243,53 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r0 = and(#24,asl(r0,#3)) -; CHECK-NEXT: r3 = and(r0,#-4) -; CHECK-NEXT: r4 = ##65535 +; CHECK-NEXT: r2 = and(r0,#-4) +; CHECK-NEXT: r3 = ##65535 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = asl(r4,r0) -; CHECK-NEXT: r5 = zxth(r1) +; CHECK-NEXT: r5 = asl(r3,r0) +; CHECK-NEXT: r4 = zxth(r1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r6 = sub(#-1,r2) +; CHECK-NEXT: r5 = sub(#-1,r5) ; CHECK-NEXT: } ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { -; CHECK-NEXT: r7 = memw_locked(r3) +; CHECK-NEXT: r7 = ##65535 +; CHECK-NEXT: r6 = memw_locked(r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = lsr(r7,r0) -; CHECK-NEXT: r7 = and(r7,r6) +; CHECK-NEXT: r7 &= lsr(r6,r0) +; CHECK-NEXT: r8 = and(r6,r5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p0 = bitsclr(r2,r4) -; CHECK-NEXT: r8 = zxth(r2) +; CHECK-NEXT: p0 = cmp.gtu(r7,r4) +; CHECK-NEXT: r7 = add(r7,#-1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p1 = cmp.gtu(r8,r5) -; CHECK-NEXT: if (p1.new) r8 = add(r1,#0) -; CHECK-NEXT: if (!p1.new) r8 = add(r2,#-1) +; CHECK-NEXT: p1 = !bitsset(r3,r7) +; CHECK-NEXT: r9 = mux(p0,r1,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) r8 = add(r1,#0) +; CHECK-NEXT: if (p1) r9 = add(r1,#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r8 = zxth(r8) +; CHECK-NEXT: r7 = zxth(r9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 |= asl(r8,r0) +; CHECK-NEXT: r8 |= asl(r7,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw_locked(r3,p0) = r7 +; CHECK-NEXT: memw_locked(r2,p0) = r8 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: if (!p0) jump:nt .LBB5_1 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.2: // %atomicrmw.end ; CHECK-NEXT: { -; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: r0 = lsr(r6,r0) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst @@ -308,15 +308,17 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.gtu(r2,r1) -; 
CHECK-NEXT: p1 = cmp.eq(r2,#0) -; CHECK-NEXT: if (p0.new) r3 = add(r1,#0) -; CHECK-NEXT: if (!p0.new) r3 = add(r2,#-1) +; CHECK-NEXT: r3 = add(r2,#-1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: if (p1) r3 = add(r1,#0) +; CHECK-NEXT: p1 = cmp.eq(r3,#-1) +; CHECK-NEXT: r4 = mux(p0,r1,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memw_locked(r0,p0) = r3 +; CHECK-NEXT: if (p1) r4 = add(r1,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memw_locked(r0,p0) = r4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: if (!p0) jump:nt .LBB6_1 @@ -336,7 +338,6 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = combine(#-1,#-1) -; CHECK-NEXT: r9:8 = combine(#0,#0) ; CHECK-NEXT: } ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_1: // %atomicrmw.start @@ -345,22 +346,20 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: r5:4 = memd_locked(r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p1 = cmp.gtu(r5:4,r3:2) -; CHECK-NEXT: p0 = cmp.eq(r5:4,r9:8) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = add(r5:4,r7:6) +; CHECK-NEXT: p0 = cmp.gtu(r5:4,r3:2) +; CHECK-NEXT: r9:8 = add(r5:4,r7:6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = mux(p1,r2,r12) -; CHECK-NEXT: r14 = mux(p1,r3,r13) +; CHECK-NEXT: p1 = cmp.eq(r9:8,r7:6) +; CHECK-NEXT: r1 = mux(p0,r2,r8) +; CHECK-NEXT: r12 = mux(p0,r3,r9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r10 = mux(p0,r2,r1) -; CHECK-NEXT: r11 = mux(p0,r3,r14) +; CHECK-NEXT: r14 = mux(p1,r2,r1) +; CHECK-NEXT: r15 = mux(p1,r3,r12) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: memd_locked(r0,p0) = r11:10 +; CHECK-NEXT: memd_locked(r0,p0) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: if (!p0) jump:nt .LBB7_1 diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll index 78285f6d1ae64..c3b27a84ac3f1 100644 --- a/llvm/test/CodeGen/Hexagon/loop-balign.ll +++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll @@ -1,9 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN ; BALIGN: .p2align{{.*}}5 ; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr { +; BALIGN-LABEL: foo: +; BALIGN: .cfi_startproc +; BALIGN-NEXT: // %bb.0: // %entry +; BALIGN-NEXT: { +; BALIGN-NEXT: r5 = asl(r1,#2) +; BALIGN-NEXT: r3 = add(r0,#-1) +; BALIGN-NEXT: r4 = #-2 +; BALIGN-NEXT: } +; BALIGN-NEXT: // implicit-def: $d3 +; BALIGN-NEXT: { +; BALIGN-NEXT: p0 = cmp.gt(r3,#0) +; BALIGN-NEXT: r3 = #0 +; BALIGN-NEXT: r8 = r5 +; BALIGN-NEXT: if (!p0.new) r0 = #1 +; BALIGN-NEXT: } +; BALIGN-NEXT: { +; BALIGN-NEXT: p0 = cmp.gt(r1,#0) +; BALIGN-NEXT: jump .LBB0_1 +; BALIGN-NEXT: } +; BALIGN-NEXT: .p2align 4 +; BALIGN-NEXT: .LBB0_8: // %for.end7 +; BALIGN-NEXT: // in Loop: Header=BB0_1 Depth=1 +; BALIGN-NEXT: { +; BALIGN-NEXT: r3 = add(r3,#1) +; BALIGN-NEXT: r4 = add(r4,#1) +; BALIGN-NEXT: } +; BALIGN-NEXT: { +; BALIGN-NEXT: p1 = cmp.eq(r3,r0) +; BALIGN-NEXT: if (p1.new) jumpr:nt r31 +; BALIGN-NEXT: } +; BALIGN-NEXT: .LBB0_1: // %Outerloop +; BALIGN-NEXT: // =>This Loop Header: Depth=1 +; BALIGN-NEXT: // Child Loop BB0_3 Depth 2 +; BALIGN-NEXT: // Child Loop BB0_6 Depth 3 +; BALIGN-NEXT: { +; BALIGN-NEXT: if (!p0) jump:nt .LBB0_8 +; BALIGN-NEXT: } +; BALIGN-NEXT: // %bb.2: // %for.body.lr.ph +; BALIGN-NEXT: // in Loop: 
Header=BB0_1 Depth=1 +; BALIGN-NEXT: { +; BALIGN-NEXT: loop1(.LBB0_3,r1) +; BALIGN-NEXT: p1 = cmp.eq(r3,#0) +; BALIGN-NEXT: p2 = cmp.eq(r3,#1) +; BALIGN-NEXT: jump .LBB0_3 +; BALIGN-NEXT: } +; BALIGN-NEXT: .p2align 4 +; BALIGN-NEXT: .LBB0_7: // %for.end +; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2 +; BALIGN-NEXT: { +; BALIGN-NEXT: r9 = clb(r7:6) +; BALIGN-NEXT: memw(r2+#0) = r9.new +; BALIGN-NEXT: } +; BALIGN-NEXT: { +; BALIGN-NEXT: nop +; BALIGN-NEXT: nop +; BALIGN-NEXT: nop +; BALIGN-NEXT: } :endloop1 +; BALIGN-NEXT: { +; BALIGN-NEXT: jump .LBB0_8 +; BALIGN-NEXT: } +; BALIGN-NEXT: .LBB0_3: // Block address taken +; BALIGN-NEXT: // %for.body +; BALIGN-NEXT: // Parent Loop BB0_1 Depth=1 +; BALIGN-NEXT: // => This Loop Header: Depth=2 +; BALIGN-NEXT: // Child Loop BB0_6 Depth 3 +; BALIGN-NEXT: { +; BALIGN-NEXT: r12 = r8 +; BALIGN-NEXT: r8 = add(r8,r5) +; BALIGN-NEXT: if (p1) jump:nt .LBB0_7 +; BALIGN-NEXT: } +; BALIGN-NEXT: // %bb.4: // %for.body4.peel +; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2 +; BALIGN-NEXT: { +; BALIGN-NEXT: r12 = memw(r12+#0) +; BALIGN-NEXT: } +; BALIGN-NEXT: { +; BALIGN-NEXT: r7:6 -= mpy(r12,r9) +; BALIGN-NEXT: if (p2) jump:nt .LBB0_7 +; BALIGN-NEXT: } +; BALIGN-NEXT: // %bb.5: // %for.body4.preheader.peel.newph +; BALIGN-NEXT: // in Loop: Header=BB0_3 Depth=2 +; BALIGN-NEXT: { +; BALIGN-NEXT: r13 = add(r4,#1) +; BALIGN-NEXT: r12 = memw(r8+#0) +; BALIGN-NEXT: } +; BALIGN-NEXT: { +; BALIGN-NEXT: loop0(.LBB0_6,r13) +; BALIGN-NEXT: } +; BALIGN-NEXT: .p2align 4 +; BALIGN-NEXT: .LBB0_6: // Block address taken +; BALIGN-NEXT: // %for.body4 +; BALIGN-NEXT: // Parent Loop BB0_1 Depth=1 +; BALIGN-NEXT: // Parent Loop BB0_3 Depth=2 +; BALIGN-NEXT: // => This Inner Loop Header: Depth=3 +; BALIGN-NEXT: { +; BALIGN-NEXT: r7:6 -= mpy(r12,r9) +; BALIGN-NEXT: nop +; BALIGN-NEXT: } :endloop0 +; BALIGN-NEXT: { +; BALIGN-NEXT: jump .LBB0_7 +; BALIGN-NEXT: } entry: %shl = shl i32 %nRow, 2 %cmp36 = icmp sgt i32 %nRow, 0 @@ -85,7 +187,7 @@ if.end: ; preds = %for.end7 } ; Function Attrs: nounwind readnone -declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32) +declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32) ; Function Attrs: nounwind readnone declare i32 @llvm.hexagon.S2.clbp(i64) diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index cdbbabe3e3b05..25aff73a38b82 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -61,21 +61,21 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.3: // %udiv-bb1 ; CHECK-NEXT: add.cc.s64 %rd71, %rd26, 1; ; CHECK-NEXT: addc.cc.s64 %rd72, %rd27, 0; -; CHECK-NEXT: or.b64 %rd30, %rd71, %rd72; -; CHECK-NEXT: setp.eq.b64 %p16, %rd30, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd26; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd31, %rd3, %r6; +; CHECK-NEXT: shl.b64 %rd30, %rd3, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd32, %rd2, %r7; -; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32; +; CHECK-NEXT: shr.u64 %rd31, %rd2, %r7; +; CHECK-NEXT: or.b64 %rd32, %rd30, %rd31; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd34, %rd2, %r8; -; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd76, %rd34, %rd33, %p17; +; CHECK-NEXT: shl.b64 %rd33, %rd2, %r8; +; CHECK-NEXT: setp.gt.s32 %p16, %r6, 63; +; CHECK-NEXT: selp.b64 %rd76, %rd33, %rd32, %p16; ; CHECK-NEXT: shl.b64 %rd75, %rd2, %r6; +; CHECK-NEXT: or.b64 %rd34, %rd71, %rd72; +; CHECK-NEXT: setp.eq.b64 %p17, %rd34, 0; ; CHECK-NEXT: mov.b64 %rd69, %rd70; -; 
CHECK-NEXT: @%p16 bra $L__BB0_4; +; CHECK-NEXT: @%p17 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd71; ; CHECK-NEXT: shr.u64 %rd35, %rd2, %r9; @@ -191,21 +191,21 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.3: // %udiv-bb1 ; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1; ; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0; -; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59; -; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6; +; CHECK-NEXT: shl.b64 %rd21, %rd6, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7; -; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; +; CHECK-NEXT: shr.u64 %rd22, %rd5, %r7; +; CHECK-NEXT: or.b64 %rd23, %rd21, %rd22; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8; -; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd24, %rd5, %r8; +; CHECK-NEXT: setp.gt.s32 %p14, %r6, 63; +; CHECK-NEXT: selp.b64 %rd63, %rd24, %rd23, %p14; ; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6; +; CHECK-NEXT: or.b64 %rd25, %rd58, %rd59; +; CHECK-NEXT: setp.eq.b64 %p15, %rd25, 0; ; CHECK-NEXT: mov.b64 %rd56, %rd57; -; CHECK-NEXT: @%p14 bra $L__BB1_4; +; CHECK-NEXT: @%p15 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd58; ; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9; @@ -363,21 +363,21 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.3: // %udiv-bb1 ; CHECK-NEXT: add.cc.s64 %rd66, %rd27, 1; ; CHECK-NEXT: addc.cc.s64 %rd67, %rd28, 0; -; CHECK-NEXT: or.b64 %rd31, %rd66, %rd67; -; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd27; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd32, %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd31, %rd2, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd33, %rd1, %r7; -; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33; +; CHECK-NEXT: shr.u64 %rd32, %rd1, %r7; +; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd35, %rd1, %r8; -; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd71, %rd35, %rd34, %p17; +; CHECK-NEXT: shl.b64 %rd34, %rd1, %r8; +; CHECK-NEXT: setp.gt.s32 %p16, %r6, 63; +; CHECK-NEXT: selp.b64 %rd71, %rd34, %rd33, %p16; ; CHECK-NEXT: shl.b64 %rd70, %rd1, %r6; +; CHECK-NEXT: or.b64 %rd35, %rd66, %rd67; +; CHECK-NEXT: setp.eq.b64 %p17, %rd35, 0; ; CHECK-NEXT: mov.b64 %rd64, %rd65; -; CHECK-NEXT: @%p16 bra $L__BB4_4; +; CHECK-NEXT: @%p17 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; ; CHECK-NEXT: shr.u64 %rd36, %rd1, %r9; @@ -487,21 +487,21 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: // %bb.3: // %udiv-bb1 ; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1; ; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0; -; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53; -; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd21, %rd4, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7; -; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; +; CHECK-NEXT: shr.u64 %rd22, %rd3, %r7; +; CHECK-NEXT: or.b64 %rd23, %rd21, %rd22; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8; -; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; 
CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd24, %rd3, %r8; +; CHECK-NEXT: setp.gt.s32 %p14, %r6, 63; +; CHECK-NEXT: selp.b64 %rd57, %rd24, %rd23, %p14; ; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6; +; CHECK-NEXT: or.b64 %rd25, %rd52, %rd53; +; CHECK-NEXT: setp.eq.b64 %p15, %rd25, 0; ; CHECK-NEXT: mov.b64 %rd50, %rd51; -; CHECK-NEXT: @%p14 bra $L__BB5_4; +; CHECK-NEXT: @%p15 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader ; CHECK-NEXT: cvt.u32.u64 %r9, %rd52; ; CHECK-NEXT: shr.u64 %rd26, %rd3, %r9; diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll index 4dc6d0ad3d5c7..05fe11026cc59 100644 --- a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll @@ -370,42 +370,42 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-LABEL: atomicrmw_udec_wrap_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: sync -; CHECK-NEXT: ld 6, 0(3) +; CHECK-NEXT: ld 8, 0(3) +; CHECK-NEXT: li 6, 1 +; CHECK-NEXT: li 7, 0 ; CHECK-NEXT: .LBB7_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Loop Header: Depth=1 -; CHECK-NEXT: # Child Loop BB7_4 Depth 2 -; CHECK-NEXT: cmpdi 6, 0 -; CHECK-NEXT: mr 7, 4 -; CHECK-NEXT: bc 12, 2, .LBB7_4 +; CHECK-NEXT: # Child Loop BB7_3 Depth 2 +; CHECK-NEXT: subc 5, 8, 6 +; CHECK-NEXT: addze. 9, 7 +; CHECK-NEXT: cmpld 1, 8, 4 +; CHECK-NEXT: cror 20, 2, 5 +; CHECK-NEXT: mr 9, 4 +; CHECK-NEXT: bc 12, 20, .LBB7_3 ; CHECK-NEXT: # %bb.2: # %atomicrmw.start ; CHECK-NEXT: # -; CHECK-NEXT: cmpld 6, 4 -; CHECK-NEXT: mr 7, 4 -; CHECK-NEXT: bc 12, 1, .LBB7_4 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # -; CHECK-NEXT: addi 7, 6, -1 -; CHECK-NEXT: .LBB7_4: # %cmpxchg.start +; CHECK-NEXT: mr 9, 5 +; CHECK-NEXT: .LBB7_3: # %cmpxchg.start ; CHECK-NEXT: # Parent Loop BB7_1 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldarx 5, 0, 3 -; CHECK-NEXT: cmpld 5, 6 -; CHECK-NEXT: bne- 0, .LBB7_7 -; CHECK-NEXT: # %bb.5: # %cmpxchg.fencedstore +; CHECK-NEXT: cmpld 5, 8 +; CHECK-NEXT: bne- 0, .LBB7_6 +; CHECK-NEXT: # %bb.4: # %cmpxchg.fencedstore ; CHECK-NEXT: # -; CHECK-NEXT: stdcx. 7, 0, 3 +; CHECK-NEXT: stdcx. 
9, 0, 3 ; CHECK-NEXT: creqv 20, 20, 20 -; CHECK-NEXT: bne- 0, .LBB7_4 -; CHECK-NEXT: # %bb.6: # %cmpxchg.end +; CHECK-NEXT: bne- 0, .LBB7_3 +; CHECK-NEXT: # %bb.5: # %cmpxchg.end ; CHECK-NEXT: # -; CHECK-NEXT: mr 6, 5 +; CHECK-NEXT: mr 8, 5 ; CHECK-NEXT: bc 4, 20, .LBB7_1 -; CHECK-NEXT: b .LBB7_8 -; CHECK-NEXT: .LBB7_7: # %cmpxchg.nostore +; CHECK-NEXT: b .LBB7_7 +; CHECK-NEXT: .LBB7_6: # %cmpxchg.nostore ; CHECK-NEXT: # -; CHECK-NEXT: mr 6, 5 +; CHECK-NEXT: mr 8, 5 ; CHECK-NEXT: b .LBB7_1 -; CHECK-NEXT: .LBB7_8: # %atomicrmw.end +; CHECK-NEXT: .LBB7_7: # %atomicrmw.end ; CHECK-NEXT: mr 3, 5 ; CHECK-NEXT: lwsync ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index 4e04f38a6301d..edc950cec9b8d 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -1121,14 +1121,15 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; RV32I-NEXT: bnez a0, .LBB6_4 ; RV32I-NEXT: .LBB6_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: seqz a0, a3 -; RV32I-NEXT: sltu a1, s1, a3 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a0, a3, -1 +; RV32I-NEXT: sltu a1, a3, a0 +; RV32I-NEXT: sltu a2, s1, a3 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: mv a2, s1 -; RV32I-NEXT: bnez a0, .LBB6_1 +; RV32I-NEXT: bnez a1, .LBB6_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: j .LBB6_1 ; RV32I-NEXT: .LBB6_4: # %atomicrmw.end ; RV32I-NEXT: mv a0, a3 @@ -1164,14 +1165,15 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; RV32IA-NEXT: # =>This Loop Header: Depth=1 ; RV32IA-NEXT: # Child Loop BB6_5 Depth 2 ; RV32IA-NEXT: mv a3, a2 -; RV32IA-NEXT: seqz a2, a2 -; RV32IA-NEXT: sltu a4, a1, a3 -; RV32IA-NEXT: or a2, a2, a4 +; RV32IA-NEXT: addi a2, a2, -1 +; RV32IA-NEXT: sltu a4, a3, a2 +; RV32IA-NEXT: sltu a5, a1, a3 +; RV32IA-NEXT: or a5, a4, a5 ; RV32IA-NEXT: mv a4, a1 -; RV32IA-NEXT: bnez a2, .LBB6_1 +; RV32IA-NEXT: bnez a5, .LBB6_1 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV32IA-NEXT: addi a4, a3, -1 +; RV32IA-NEXT: mv a4, a2 ; RV32IA-NEXT: j .LBB6_1 ; RV32IA-NEXT: .LBB6_4: # %atomicrmw.end ; RV32IA-NEXT: mv a0, a2 @@ -1431,14 +1433,15 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV64I-NEXT: bnez a0, .LBB7_4 ; RV64I-NEXT: .LBB7_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: seqz a0, a3 -; RV64I-NEXT: sltu a1, s1, a3 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addi a0, a3, -1 +; RV64I-NEXT: sltu a1, a3, a0 +; RV64I-NEXT: sltu a2, s1, a3 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: mv a2, s1 -; RV64I-NEXT: bnez a0, .LBB7_1 +; RV64I-NEXT: bnez a1, .LBB7_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV64I-NEXT: addi a2, a3, -1 +; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: j .LBB7_1 ; RV64I-NEXT: .LBB7_4: # %atomicrmw.end ; RV64I-NEXT: mv a0, a3 @@ -1474,14 +1477,15 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB7_5 Depth 2 ; RV64IA-NEXT: mv a3, a2 -; RV64IA-NEXT: seqz a2, a2 -; RV64IA-NEXT: sltu a4, a1, a3 -; RV64IA-NEXT: or a2, a2, a4 +; RV64IA-NEXT: addi a2, a2, -1 +; RV64IA-NEXT: sltu a4, a3, a2 +; RV64IA-NEXT: sltu a5, a1, a3 +; RV64IA-NEXT: or a5, a4, a5 ; RV64IA-NEXT: mv a4, 
a1 -; RV64IA-NEXT: bnez a2, .LBB7_1 +; RV64IA-NEXT: bnez a5, .LBB7_1 ; RV64IA-NEXT: # %bb.3: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV64IA-NEXT: addi a4, a3, -1 +; RV64IA-NEXT: mv a4, a2 ; RV64IA-NEXT: j .LBB7_1 ; RV64IA-NEXT: .LBB7_4: # %atomicrmw.end ; RV64IA-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebdeb3775..b1d396d70ff5f 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -8,19 +8,21 @@ define i32 @optbranch_32(i32 %Arg) { ; RV32-LABEL: optbranch_32: ; RV32: # %bb.0: # %bb ; RV32-NEXT: addi a0, a0, 1 -; RV32-NEXT: bnez a0, .LBB0_2 -; RV32-NEXT: # %bb.1: # %bb2 +; RV32-NEXT: beqz a0, .LBB0_2 +; RV32-NEXT: # %bb.1: # %bb3 +; RV32-NEXT: ret +; RV32-NEXT: .LBB0_2: # %bb2 ; RV32-NEXT: li a0, -1 -; RV32-NEXT: .LBB0_2: # %bb3 ; RV32-NEXT: ret ; ; RV64-LABEL: optbranch_32: ; RV64: # %bb.0: # %bb ; RV64-NEXT: addiw a0, a0, 1 -; RV64-NEXT: bnez a0, .LBB0_2 -; RV64-NEXT: # %bb.1: # %bb2 +; RV64-NEXT: beqz a0, .LBB0_2 +; RV64-NEXT: # %bb.1: # %bb3 +; RV64-NEXT: ret +; RV64-NEXT: .LBB0_2: # %bb2 ; RV64-NEXT: li a0, -1 -; RV64-NEXT: .LBB0_2: # %bb3 ; RV64-NEXT: ret bb: %i1 = icmp eq i32 %Arg, -1 @@ -41,20 +43,22 @@ define i64 @optbranch_64(i64 %Arg) { ; RV32-NEXT: seqz a2, a0 ; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: or a2, a0, a1 -; RV32-NEXT: bnez a2, .LBB1_2 -; RV32-NEXT: # %bb.1: # %bb2 +; RV32-NEXT: beqz a2, .LBB1_2 +; RV32-NEXT: # %bb.1: # %bb3 +; RV32-NEXT: ret +; RV32-NEXT: .LBB1_2: # %bb2 ; RV32-NEXT: li a0, -1 ; RV32-NEXT: li a1, -1 -; RV32-NEXT: .LBB1_2: # %bb3 ; RV32-NEXT: ret ; ; RV64-LABEL: optbranch_64: ; RV64: # %bb.0: # %bb ; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: bnez a0, .LBB1_2 -; RV64-NEXT: # %bb.1: # %bb2 +; RV64-NEXT: beqz a0, .LBB1_2 +; RV64-NEXT: # %bb.1: # %bb3 +; RV64-NEXT: ret +; RV64-NEXT: .LBB1_2: # %bb2 ; RV64-NEXT: li a0, -1 -; RV64-NEXT: .LBB1_2: # %bb3 ; RV64-NEXT: ret bb: %i1 = icmp eq i64 %Arg, -1 diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ba6769b2aa3e1..db29a828a2c13 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -792,24 +792,26 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) { define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { ; RV32-LABEL: usubo_ult_i64_math_overflow_used: ; RV32: # %bb.0: -; RV32-NEXT: mv a5, a0 -; RV32-NEXT: sltu a0, a0, a2 -; RV32-NEXT: sub a6, a1, a3 -; RV32-NEXT: sub a5, a5, a2 -; RV32-NEXT: sub a2, a6, a0 -; RV32-NEXT: sw a5, 0(a4) -; RV32-NEXT: sw a2, 4(a4) -; RV32-NEXT: beq a1, a3, .LBB23_2 +; RV32-NEXT: sltu a5, a0, a2 +; RV32-NEXT: sub a3, a1, a3 +; RV32-NEXT: sub a3, a3, a5 +; RV32-NEXT: sub a2, a0, a2 +; RV32-NEXT: beq a3, a1, .LBB23_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: sltu a0, a1, a3 +; RV32-NEXT: j .LBB23_3 ; RV32-NEXT: .LBB23_2: +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: .LBB23_3: +; RV32-NEXT: sw a2, 0(a4) +; RV32-NEXT: sw a3, 4(a4) ; RV32-NEXT: ret ; ; RV64-LABEL: usubo_ult_i64_math_overflow_used: ; RV64: # %bb.0: -; RV64-NEXT: sub a3, a0, a1 +; RV64-NEXT: sub a1, a0, a1 ; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: sd a3, 0(a2) +; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret %s = sub i64 %x, %y store i64 %s, ptr %p @@ -822,20 +824,17 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) { ; RV32-LABEL: usubo_ugt_i32: ; RV32: 
# %bb.0: -; RV32-NEXT: sltu a3, a0, a1 -; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: sw a0, 0(a2) -; RV32-NEXT: mv a0, a3 +; RV32-NEXT: sub a1, a0, a1 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: sw a1, 0(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: usubo_ugt_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a1 -; RV64-NEXT: sext.w a4, a0 -; RV64-NEXT: sltu a3, a4, a3 -; RV64-NEXT: sub a0, a0, a1 -; RV64-NEXT: sw a0, 0(a2) -; RV64-NEXT: mv a0, a3 +; RV64-NEXT: subw a1, a0, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: sw a1, 0(a2) ; RV64-NEXT: ret %ov = icmp ugt i32 %y, %x %s = sub i32 %x, %y @@ -957,16 +956,16 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) { ; RV32-LABEL: usubo_eq_constant1_op1_i32: ; RV32: # %bb.0: ; RV32-NEXT: addi a2, a0, -1 -; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sw a2, 0(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: usubo_eq_constant1_op1_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a2, a0 -; RV64-NEXT: addi a3, a0, -1 -; RV64-NEXT: seqz a0, a2 -; RV64-NEXT: sw a3, 0(a1) +; RV64-NEXT: addiw a2, a0, -1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret %s = add i32 %x, -1 %ov = icmp eq i32 %x, 0 @@ -980,16 +979,15 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, ptr %p) { ; RV32-LABEL: usubo_ne_constant0_op1_i32: ; RV32: # %bb.0: ; RV32-NEXT: neg a2, a0 -; RV32-NEXT: snez a0, a0 +; RV32-NEXT: snez a0, a2 ; RV32-NEXT: sw a2, 0(a1) ; RV32-NEXT: ret ; ; RV64-LABEL: usubo_ne_constant0_op1_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a2, a0 -; RV64-NEXT: neg a3, a0 +; RV64-NEXT: negw a2, a0 ; RV64-NEXT: snez a0, a2 -; RV64-NEXT: sw a3, 0(a1) +; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret %s = sub i32 0, %x %ov = icmp ne i32 %x, 0 @@ -1076,41 +1074,43 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s5, a5 -; RV32-NEXT: mv s3, a1 +; RV32-NEXT: mv s2, a5 +; RV32-NEXT: mv s1, a1 ; RV32-NEXT: andi a1, a5, 1 -; RV32-NEXT: beqz a1, .LBB32_8 +; RV32-NEXT: beqz a1, .LBB32_7 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s2, a3 -; RV32-NEXT: mv s1, a2 +; RV32-NEXT: mv s5, a3 +; RV32-NEXT: mv s3, a2 ; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq s3, a3, .LBB32_3 +; RV32-NEXT: beq s1, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s3, s2 +; RV32-NEXT: sltu s6, s1, s5 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s4, s3 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call -; RV32-NEXT: beqz s6, .LBB32_8 +; RV32-NEXT: beqz s6, .LBB32_7 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s3, s2, .LBB32_7 +; RV32-NEXT: sltu a0, s4, s3 +; RV32-NEXT: sub a1, s1, s5 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a2, s4, s3 +; RV32-NEXT: beq a1, s1, .LBB32_8 ; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s3, s2 -; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s3, s2 -; RV32-NEXT: sub a3, s4, s1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sw a3, 0(s0) -; RV32-NEXT: sw a2, 4(s0) +; RV32-NEXT: sltu a0, s1, a1 ; RV32-NEXT: j .LBB32_9 -; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s5 -; RV32-NEXT: .LBB32_9: # %f +; RV32-NEXT: .LBB32_7: # %f +; RV32-NEXT: mv a0, s2 +; RV32-NEXT: j .LBB32_10 +; RV32-NEXT: .LBB32_8: +; RV32-NEXT: sltu a0, s4, a2 +; RV32-NEXT: .LBB32_9: # %end +; RV32-NEXT: sw a2, 0(s0) +; 
RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: .LBB32_10: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1140,44 +1140,39 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 ; RV64-NEXT: .cfi_offset s1, -24 ; RV64-NEXT: .cfi_offset s2, -32 ; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: mv s0, a3 +; RV64-NEXT: mv s1, a3 ; RV64-NEXT: mv s2, a1 ; RV64-NEXT: andi a1, a3, 1 ; RV64-NEXT: beqz a1, .LBB32_3 ; RV64-NEXT: # %bb.1: # %t -; RV64-NEXT: mv s1, a2 +; RV64-NEXT: mv s0, a2 ; RV64-NEXT: mv s3, a0 -; RV64-NEXT: sltu s4, a0, s2 -; RV64-NEXT: mv a0, s4 +; RV64-NEXT: sltu a0, a0, s2 ; RV64-NEXT: call call ; RV64-NEXT: bgeu s3, s2, .LBB32_3 ; RV64-NEXT: # %bb.2: # %end -; RV64-NEXT: sub a0, s3, s2 -; RV64-NEXT: sd a0, 0(s1) -; RV64-NEXT: mv a0, s4 +; RV64-NEXT: sub a1, s3, s2 +; RV64-NEXT: sltu a0, s3, a1 +; RV64-NEXT: sd a1, 0(s0) ; RV64-NEXT: j .LBB32_4 ; RV64-NEXT: .LBB32_3: # %f -; RV64-NEXT: mv a0, s0 +; RV64-NEXT: mv a0, s1 ; RV64-NEXT: .LBB32_4: # %f ; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore ra ; RV64-NEXT: .cfi_restore s0 ; RV64-NEXT: .cfi_restore s1 ; RV64-NEXT: .cfi_restore s2 ; RV64-NEXT: .cfi_restore s3 -; RV64-NEXT: .cfi_restore s4 ; RV64-NEXT: addi sp, sp, 48 ; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll index 2db620dab8801..0b389e3a26c78 100644 --- a/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll +++ b/llvm/test/CodeGen/SPIRV/optimizations/add-check-overflow.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; This test aims to check ability to support "Arithmetic with Overflow" intrinsics ; in the special case when those intrinsics are being generated by the CodeGenPrepare; ; pass during translations with optimization (note -disable-lsr, to inhibit @@ -89,3 +90,6 @@ l1: exit: ret i32 %i } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} +; NOLSR: {{.*}} diff --git a/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll index 31e54c43c1e5f..4c92a00020475 100644 --- a/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll +++ b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll @@ -11,27 +11,20 @@ define i64 @f(i64 %x2, i32 %z) { ; CHECK-LABEL: f: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: subs r3, r0, #1 -; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: sbcs r3, r2 -; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: orrs r2, r1 +; CHECK-NEXT: rsbs r3, r2, #0 ; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: movs r4, #30 -; CHECK-NEXT: subs r5, r0, #1 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: sbcs r5, r2 -; CHECK-NEXT: adcs r4, r2 -; CHECK-NEXT: lsls r2, r1, #1 -; CHECK-NEXT: lsls r2, r4 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: eors r4, r3 -; CHECK-NEXT: lsrs r0, r4 -; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: lsrs r1, r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: lsrs r0, r3 +; CHECK-NEXT: movs r2, #31 +; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: lsls r4, r1, #1 +; CHECK-NEXT: lsls r4, r2 +; CHECK-NEXT: orrs r0, r4 +; CHECK-NEXT: lsrs r1, r3 +; CHECK-NEXT: pop {r4, pc} %x3 = add nsw i64 %x2, -1 %x8 = icmp ne i64 %x2, 0 %x9 = xor i1 %x8, true diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 455b72d16a075..7236b2c3eec5d 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -152,14 +152,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $176, %esp -; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: movl 36(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl 28(%ebp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl 24(%ebp), %ecx @@ -172,26 +172,27 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebp), %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 52(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: xorl %edx, %edi ; X86-NEXT: movl 48(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: movl 44(%ebp), %ebx ; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: subl %edx, %edi +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: subl %edx, %esi ; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: 
orl %esi, %eax -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl @@ -203,92 +204,99 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: bsrl %edi, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %edi, %edi -; X86-NEXT: xorl $31, %edi -; X86-NEXT: orl $32, %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: orl $64, %edi -; X86-NEXT: movl %eax, %edx +; X86-NEXT: orl $32, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: bsrl %ebx, %eax +; X86-NEXT: xorl $31, %eax ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edx -; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: bsrl %esi, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: orl $32, %edx -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl %eax, %esi -; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: orl $64, %esi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovnel %edx, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl %edx, %edi -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edi, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: movl $127, %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl $0, %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: setb %dl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: cmovnel %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %edi ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: xorl $127, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: xorps %xmm0, %xmm0 @@ -299,8 +307,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: xorb $127, %cl ; 
X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al @@ -310,251 +316,245 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl 152(%esp,%eax), %esi ; X86-NEXT: movl 156(%esp,%eax), %edx ; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esp,%eax), %edx -; X86-NEXT: movl 148(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %ebx +; X86-NEXT: movl 144(%esp,%eax), %ebx +; X86-NEXT: movl 148(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: jae .LBB4_2 -; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: jmp .LBB4_9 -; X86-NEXT: .LBB4_2: # %udiv-preheader -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: je .LBB4_6 +; X86-NEXT: # %bb.2: # %udiv-preheader ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 108(%esp,%eax), %edx +; X86-NEXT: movl 108(%esp,%eax), %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 104(%esp,%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%eax), %edi +; X86-NEXT: movl 100(%esp,%eax), %eax ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shrdl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %esi -; X86-NEXT: movl 100(%esp,%eax), %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shrdl %cl, %ebx, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl $1, %ebx, %eax +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %edx -; X86-NEXT: shldl $1, %ecx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $-1, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jmp .LBB4_7 +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jmp .LBB4_9 +; X86-NEXT: .LBB4_6: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %ebx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: shldl $1, %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: shldl $1, %edi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: shldl $1, %esi, %edx ; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shldl $1, %edi, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: orl %eax, %ebx ; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: sbbl %ecx, %eax -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%ebp), %ecx -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %ebx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %ecx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: sbbl 
%eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %ecx +; X86-NEXT: movl 56(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull 40(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 44(%ebp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: setb %bl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %esi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl 40(%ebp), %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %edi +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull %esi, %edi +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl 48(%ebp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: movl 52(%ebp), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: mull %ecx ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl 48(%ebp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 52(%ebp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebx -; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 24(%ebp), %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 28(%ebp), %ecx -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 32(%ebp), %edi -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 36(%ebp), %esi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: sbbl %edi, %ebx ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 859e9244d29d2..199cae7f563b3 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -152,12 +152,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $160, %esp -; X86-NEXT: movl 40(%ebp), %ebx -; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %ebx ; X86-NEXT: movl 44(%ebp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: orl 48(%ebp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl @@ -169,161 +168,157 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl 48(%ebp), %ecx +; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: bsrl 48(%ebp), %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebx, %eax +; X86-NEXT: orl $32, %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: bsrl 40(%ebp), %eax ; X86-NEXT: xorl $31, %eax ; X86-NEXT: orl $32, %eax ; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: orl $64, %eax -; X86-NEXT: movl 48(%ebp), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: movl 36(%ebp), %ebx -; X86-NEXT: bsrl %ebx, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl 32(%ebp), %ecx -; X86-NEXT: bsrl %ecx, %ecx -; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: bsrl %edi, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl 24(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: bsrl %ecx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: orl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx -; X86-NEXT: movl 32(%ebp), %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl %edx, %eax +; X86-NEXT: movl 28(%ebp), 
%ebx +; X86-NEXT: bsrl %ebx, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: bsrl 24(%ebp), %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: orl $64, %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: orl 36(%ebp), %edi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: cmovnel %edx, %esi +; X86-NEXT: subl %esi, %eax +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %ebx ; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $127, %edx +; X86-NEXT: movl $127, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: movl $0, %edx +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: movl $0, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl $0, %esi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl $0, %esi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: setb %dl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: movl 36(%ebp), %eax -; X86-NEXT: cmovnel %edi, %eax -; X86-NEXT: movl 32(%ebp), %esi -; X86-NEXT: cmovnel %edi, %esi -; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: cmovnel %edi, %edx -; X86-NEXT: movl 24(%ebp), %ebx -; X86-NEXT: cmovnel %edi, %ebx -; X86-NEXT: movl 56(%ebp), %edi -; X86-NEXT: jne .LBB4_8 -; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: setb %ah +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Folded Reload +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: cmovnel %ecx, %ebx +; X86-NEXT: cmovnel %ecx, %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: cmovnel %ecx, %esi +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.8: # %_udiv-special-cases ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: xorl $127, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: je .LBB4_9 +; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl 56(%ebp), %edi -; X86-NEXT: movl 24(%ebp), %ecx -; X86-NEXT: je .LBB4_8 -; X86-NEXT: # %bb.2: # %udiv-bb1 -; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; 
X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 28(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 32(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 36(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: xorb $127, %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 136(%esp,%eax), %esi -; X86-NEXT: movl 140(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 136(%esp,%eax), %edi +; X86-NEXT: movl 140(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 128(%esp,%eax), %ebx -; X86-NEXT: movl 132(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: jae .LBB4_3 -; X86-NEXT: # %bb.6: -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_3: # %udiv-preheader +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: je .LBB4_6 +; X86-NEXT: # %bb.2: # %udiv-preheader ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%ebp), %edi -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl 28(%ebp), %edi -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl 32(%ebp), %edi -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl 36(%ebp), %edi -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 92(%esp,%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), 
%esi +; X86-NEXT: movl 88(%esp,%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrdl %cl, %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esp,%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shrdl %cl, %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 80(%esp,%eax), %edx ; X86-NEXT: movl 84(%esp,%eax), %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shrdl %cl, %edi, %ebx -; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 40(%ebp), %eax @@ -338,41 +333,46 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl 52(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: .p2align 4 -; X86-NEXT: .LBB4_4: # %udiv-do-while +; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl $1, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: shldl $1, %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax @@ -386,94 +386,100 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl 40(%ebp), %ecx ; X86-NEXT: subl %ecx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: jne .LBB4_4 -; X86-NEXT: # %bb.5: +; X86-NEXT: jne .LBB4_3 +; X86-NEXT: # %bb.4: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jmp .LBB4_7 +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jmp .LBB4_9 +; X86-NEXT: .LBB4_6: +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl 56(%ebp), %edi ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: shldl $1, %edx, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: shldl $1, %esi, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: addl %ebx, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: .LBB4_8: # %udiv-end -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: .LBB4_9: # %udiv-end ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, (%edi) -; X86-NEXT: movl %edx, 4(%edi) -; X86-NEXT: movl %esi, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 56(%ebp), %eax +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: movl 48(%ebp), %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %edx, %esi -; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: movl 52(%ebp), %edi -; X86-NEXT: imull %ebx, %edi +; X86-NEXT: imull %ecx, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl 40(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull 40(%ebp), %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl 44(%ebp), %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl 44(%ebp), %ebx +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx 
-; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull 44(%ebp) -; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl 32(%ebp), %ebx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull 44(%ebp) @@ -481,19 +487,19 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 24(%ebp), %ebx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 32(%ebp), %edi -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 36(%ebp), %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll index f72679f55e114..f114d9a2fd192 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/overflow-intrinsics.ll @@ -15,6 +15,16 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] +; +; DEBUG-LABEL: @uaddo1_overflow_used( +; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG14:![0-9]+]] +; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG14]] +; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG14]] +; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META9:![0-9]+]], !DIExpression(), [[DBG14]]) +; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META11:![0-9]+]], !DIExpression(), [[META15:![0-9]+]]) +; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG16:![0-9]+]] +; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META13:![0-9]+]], 
!DIExpression(), [[DBG16]]) +; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG17:![0-9]+]] ; %add = add i64 %b, %a %cmp = icmp ult i64 %add, %a @@ -28,8 +38,19 @@ define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp { ; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 -; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]] +; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8 ; CHECK-NEXT: ret i64 [[Q]] +; +; DEBUG-LABEL: @uaddo1_math_overflow_used( +; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG23:![0-9]+]] +; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG23]] +; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG23]] +; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META20:![0-9]+]], !DIExpression(), [[DBG23]]) +; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META21:![0-9]+]], !DIExpression(), [[META24:![0-9]+]]) +; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG25:![0-9]+]] +; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META22:![0-9]+]], !DIExpression(), [[DBG25]]) +; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG26:![0-9]+]] +; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG27:![0-9]+]] ; %add = add i64 %b, %a %cmp = icmp ult i64 %add, %a @@ -45,6 +66,16 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 ; CHECK-NEXT: ret i64 [[Q]] +; +; DEBUG-LABEL: @uaddo2_overflow_used( +; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG33:![0-9]+]] +; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG33]] +; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG33]] +; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META30:![0-9]+]], !DIExpression(), [[DBG33]]) +; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META31:![0-9]+]], !DIExpression(), [[META34:![0-9]+]]) +; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG35:![0-9]+]] +; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META32:![0-9]+]], !DIExpression(), [[DBG35]]) +; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG36:![0-9]+]] ; %add = add i64 %b, %a %cmp = icmp ult i64 %add, %b @@ -58,8 +89,19 @@ define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp { ; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42 -; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]] +; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8 ; CHECK-NEXT: ret i64 [[Q]] +; +; DEBUG-LABEL: @uaddo2_math_overflow_used( +; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG42:![0-9]+]] +; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG42]] +; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG42]] +; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META39:![0-9]+]], !DIExpression(), [[DBG42]]) +; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META40:![0-9]+]], !DIExpression(), [[META43:![0-9]+]]) +; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG44:![0-9]+]] +; DEBUG-NEXT: #dbg_value(i64 
[[Q]], [[META41:![0-9]+]], !DIExpression(), [[DBG44]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG45:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG46:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ult i64 %add, %b
@@ -75,6 +117,16 @@ define i64 @uaddo3_overflow_used(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG52:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG52]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG52]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META49:![0-9]+]], !DIExpression(), [[DBG52]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META50:![0-9]+]], !DIExpression(), [[META53:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG54:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META51:![0-9]+]], !DIExpression(), [[DBG54]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG55:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ugt i64 %b, %add
@@ -88,8 +140,19 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]]), !dbg [[DBG61:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG61]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG61]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META58:![0-9]+]], !DIExpression(), [[DBG61]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META59:![0-9]+]], !DIExpression(), [[META62:![0-9]+]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG63:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META60:![0-9]+]], !DIExpression(), [[DBG63]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[RES:%.*]], align 8, !dbg [[DBG64:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG65:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ugt i64 %b, %add
@@ -106,6 +169,15 @@ define i64 @uaddo6_xor(i64 %a, i64 %b) {
 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor(
+; DEBUG-NEXT: #dbg_value(i64 poison, [[META68:![0-9]+]], !DIExpression(), [[META71:![0-9]+]])
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]), !dbg [[DBG72:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG72]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META69:![0-9]+]], !DIExpression(), [[DBG72]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG73:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META70:![0-9]+]], !DIExpression(), [[DBG73]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG74:![0-9]+]]
 ;
 %x = xor i64 %a, -1
 %cmp = icmp ult i64 %x, %b
@@ -119,6 +191,15 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) {
 ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor_commuted(
+; DEBUG-NEXT: #dbg_value(i64 poison, [[META77:![0-9]+]], !DIExpression(), [[META80:![0-9]+]])
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]), !dbg [[DBG81:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG81]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META78:![0-9]+]], !DIExpression(), [[DBG81]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42, !dbg [[DBG82:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META79:![0-9]+]], !DIExpression(), [[DBG82]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG83:![0-9]+]]
 ;
 %x = xor i64 %a, -1
 %cmp = icmp ugt i64 %b, %x
@@ -135,6 +216,16 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
 ; CHECK-NEXT: call void @use(i64 [[X]])
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo6_xor_multi_use(
+; DEBUG-NEXT: [[X:%.*]] = xor i64 -1, [[A:%.*]], !dbg [[DBG89:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[X]], [[META86:![0-9]+]], !DIExpression(), [[DBG89]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[X]], [[B:%.*]], !dbg [[DBG90:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META87:![0-9]+]], !DIExpression(), [[DBG90]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG91:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META88:![0-9]+]], !DIExpression(), [[DBG91]])
+; DEBUG-NEXT: call void @use(i64 [[X]]), !dbg [[DBG92:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG93:![0-9]+]]
 ;
 %x = xor i64 -1, %a
 %cmp = icmp ult i64 %x, %b
@@ -145,9 +236,18 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) {
 define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: @usubo_ult_i64_overflow_used(
-; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]]), !dbg [[DBG98:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG98]]
+; DEBUG-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG98]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META96:![0-9]+]], !DIExpression(), [[DBG98]])
+; DEBUG-NEXT: #dbg_value(i1 [[OV1]], [[META97:![0-9]+]], !DIExpression(), [[META99:![0-9]+]])
+; DEBUG-NEXT: ret i1 [[OV1]], !dbg [[DBG100:![0-9]+]]
 ;
 %s = sub i64 %x, %y
 %ov = icmp ult i64 %x, %y
@@ -156,10 +256,20 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
 define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: @usubo_ult_i64_math_overflow_used(
-; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]]
-; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: store i64 [[MATH]], ptr [[P:%.*]], align 8
 ; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_math_overflow_used(
+; DEBUG-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]]), !dbg [[DBG105:![0-9]+]]
+; DEBUG-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0, !dbg [[DBG105]]
+; DEBUG-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !dbg [[DBG105]]
+; DEBUG-NEXT: #dbg_value(i64 [[MATH]], [[META103:![0-9]+]], !DIExpression(), [[DBG105]])
+; DEBUG-NEXT: store i64 [[MATH]], ptr [[P:%.*]], align 8, !dbg [[DBG106:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV1]], [[META104:![0-9]+]], !DIExpression(), [[META107:![0-9]+]])
+; DEBUG-NEXT: ret i1 [[OV1]], !dbg [[DBG108:![0-9]+]]
 ;
 %s = sub i64 %x, %y
 store i64 %s, ptr %p
diff --git a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
index ec60238cbf927..d0a3ca4daa02f 100644
--- a/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/SPARC/overflow-intrinsics.ll
@@ -14,6 +14,15 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG14:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META9:![0-9]+]], !DIExpression(), [[DBG14]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]], !dbg [[DBG15:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META11:![0-9]+]], !DIExpression(), [[DBG15]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG16:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META13:![0-9]+]], !DIExpression(), [[DBG16]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG17:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ult i64 %add, %a
@@ -23,12 +32,21 @@ define i64 @uaddo1_overflow_used(i64 %a, i64 %b) nounwind ssp {
 define i64 @uaddo1_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ; CHECK-LABEL: @uaddo1_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[ADD]], [[A]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo1_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG23:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META20:![0-9]+]], !DIExpression(), [[DBG23]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[A]], !dbg [[DBG24:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META21:![0-9]+]], !DIExpression(), [[DBG24]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG25:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META22:![0-9]+]], !DIExpression(), [[DBG25]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG26:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG27:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ult i64 %add, %a
@@ -43,6 +61,15 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG33:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META30:![0-9]+]], !DIExpression(), [[DBG33]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]], !dbg [[DBG34:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META31:![0-9]+]], !DIExpression(), [[DBG34]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG35:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META32:![0-9]+]], !DIExpression(), [[DBG35]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG36:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ult i64 %add, %b
@@ -52,12 +79,21 @@ define i64 @uaddo2_overflow_used(i64 %a, i64 %b) nounwind ssp {
 define i64 @uaddo2_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ; CHECK-LABEL: @uaddo2_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[ADD]], [[B]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo2_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG42:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META39:![0-9]+]], !DIExpression(), [[DBG42]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ult i64 [[ADD]], [[B]], !dbg [[DBG43:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META40:![0-9]+]], !DIExpression(), [[DBG43]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG44:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META41:![0-9]+]], !DIExpression(), [[DBG44]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG45:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG46:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ult i64 %add, %b
@@ -72,6 +108,15 @@ define i64 @uaddo3_overflow_used(i64 %a, i64 %b) nounwind ssp {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG52:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META49:![0-9]+]], !DIExpression(), [[DBG52]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]], !dbg [[DBG53:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META50:![0-9]+]], !DIExpression(), [[DBG53]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG54:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META51:![0-9]+]], !DIExpression(), [[DBG54]])
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG55:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ugt i64 %b, %add
@@ -81,12 +126,21 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
 ; CHECK-LABEL: @uaddo3_math_overflow_used(
-; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ugt i64 [[B]], [[ADD]]
 ; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
-; CHECK-NEXT: store i64 [[MATH]], ptr [[RES:%.*]]
+; CHECK-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8
 ; CHECK-NEXT: ret i64 [[Q]]
+;
+; DEBUG-LABEL: @uaddo3_math_overflow_used(
+; DEBUG-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]], !dbg [[DBG61:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[ADD]], [[META58:![0-9]+]], !DIExpression(), [[DBG61]])
+; DEBUG-NEXT: [[CMP:%.*]] = icmp ugt i64 [[B]], [[ADD]], !dbg [[DBG62:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[CMP]], [[META59:![0-9]+]], !DIExpression(), [[DBG62]])
+; DEBUG-NEXT: [[Q:%.*]] = select i1 [[CMP]], i64 [[B]], i64 42, !dbg [[DBG63:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[Q]], [[META60:![0-9]+]], !DIExpression(), [[DBG63]])
+; DEBUG-NEXT: store i64 [[ADD]], ptr [[RES:%.*]], align 8, !dbg [[DBG64:![0-9]+]]
+; DEBUG-NEXT: ret i64 [[Q]], !dbg [[DBG65:![0-9]+]]
 ;
 %add = add i64 %b, %a
 %cmp = icmp ugt i64 %b, %add
@@ -100,6 +154,13 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
 ; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_overflow_used(
+; DEBUG-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]], !dbg [[DBG70:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[S]], [[META68:![0-9]+]], !DIExpression(), [[DBG70]])
+; DEBUG-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]], !dbg [[DBG71:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META69:![0-9]+]], !DIExpression(), [[DBG71]])
+; DEBUG-NEXT: ret i1 [[OV]], !dbg [[DBG72:![0-9]+]]
 ;
 %s = sub i64 %x, %y
 %ov = icmp ult i64 %x, %y
@@ -109,9 +170,17 @@ define i1 @usubo_ult_i64_overflow_used(i64 %x, i64 %y, ptr %p) {
 define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; CHECK-LABEL: @usubo_ult_i64_math_overflow_used(
 ; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]]
+; CHECK-NEXT: store i64 [[S]], ptr [[P:%.*]], align 8
 ; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
 ; CHECK-NEXT: ret i1 [[OV]]
+;
+; DEBUG-LABEL: @usubo_ult_i64_math_overflow_used(
+; DEBUG-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]], !dbg [[DBG77:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i64 [[S]], [[META75:![0-9]+]], !DIExpression(), [[DBG77]])
+; DEBUG-NEXT: store i64 [[S]], ptr [[P:%.*]], align 8, !dbg [[DBG78:![0-9]+]]
+; DEBUG-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]], !dbg [[DBG79:![0-9]+]]
+; DEBUG-NEXT: #dbg_value(i1 [[OV]], [[META76:![0-9]+]], !DIExpression(), [[DBG79]])
+; DEBUG-NEXT: ret i1 [[OV]], !dbg [[DBG80:![0-9]+]]
 ;
 %s = sub i64 %x, %y
 store i64 %s, ptr %p