diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index af23060d6585c..198f120496949 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -124,10 +124,20 @@ class CombinerHelper {
 
   const TargetLowering &getTargetLowering() const;
 
+  /// \returns true if the combiner is running pre-legalization.
+  bool isPreLegalize() const;
+
+  /// \returns true if \p Query is legal on the target.
+  bool isLegal(const LegalityQuery &Query) const;
+
   /// \return true if the combine is running prior to legalization, or if \p
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
 
+  /// \return true if the combine is running prior to legalization, or if \p Ty
+  /// is a legal integer constant type on the target.
+  bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
+
   /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
   void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
                       Register ToReg) const;
@@ -649,6 +659,10 @@ class CombinerHelper {
   /// (G_*MULO x, 0) -> 0 + no carry out
   bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Match:
+  /// (G_*ADDO x, 0) -> x + no carry out
+  bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Transform (fadd x, fneg(y)) -> (fsub x, y)
   ///           (fadd fneg(x), y) -> (fsub y, x)
   ///           (fsub x, fneg(y)) -> (fadd x, y)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index c271eb0bf6ccc..ed31b6f10a31a 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -759,6 +759,12 @@ def mulo_by_0: GICombineRule<
          [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def addo_by_0: GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_UADDO, G_SADDO):$root,
+         [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 def mulh_to_lshr : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UMULH):$root,
@@ -868,7 +874,8 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
                                        fneg_fneg_fold, right_identity_one]>;
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
-                                     overlapping_and, mulo_by_2, mulo_by_0]>;
+                                     overlapping_and, mulo_by_2, mulo_by_0,
+                                     addo_by_0]>;
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index fba1adabd9e08..20677dcb0cd7a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -131,9 +131,27 @@ isBigEndian(const SmallDenseMap<int64_t, int64_t> &MemOffset2Idx,
   return BigEndian;
 }
 
+bool CombinerHelper::isPreLegalize() const { return !LI; }
+
+bool CombinerHelper::isLegal(const LegalityQuery &Query) const {
+  assert(LI && "Must have LegalizerInfo to query isLegal!");
+  return LI->getAction(Query).Action == LegalizeActions::Legal;
+}
+
 bool CombinerHelper::isLegalOrBeforeLegalizer(
     const LegalityQuery &Query) const {
-  return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
+  return isPreLegalize() || isLegal(Query);
+}
+
+bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
+  if (!Ty.isVector())
+    return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
+  // Vector constants are represented as a G_BUILD_VECTOR of scalar G_CONSTANTs.
+  if (isPreLegalize())
+    return true;
+  LLT EltTy = Ty.getElementType();
+  return isLegal({TargetOpcode::G_BUILD_VECTOR, {Ty, EltTy}}) &&
+         isLegal({TargetOpcode::G_CONSTANT, {EltTy}});
 }
 
 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
@@ -4593,24 +4611,9 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
     return false;
   Register Dst = MI.getOperand(0).getReg();
   Register Carry = MI.getOperand(1).getReg();
-  LLT DstTy = MRI.getType(Dst);
-  LLT CarryTy = MRI.getType(Carry);
-  if (DstTy.isVector()) {
-    LLT DstEltTy = DstTy.getElementType();
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_BUILD_VECTOR, {DstTy, DstEltTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstEltTy}}))
-      return false;
-    LLT CarryEltTy = CarryTy.getElementType();
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_BUILD_VECTOR, {CarryTy, CarryEltTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {CarryEltTy}}))
-      return false;
-  } else {
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {CarryTy}}))
-      return false;
-  }
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Dst)) ||
+      !isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
   MatchInfo = [=](MachineIRBuilder &B) {
     B.buildConstant(Dst, 0);
     B.buildConstant(Carry, 0);
@@ -4618,6 +4621,24 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
+bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  // (G_*ADDO x, 0) -> x + no carry out
+  unsigned Opc = MI.getOpcode();
+  assert(Opc == TargetOpcode::G_UADDO || Opc == TargetOpcode::G_SADDO);
+  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+    return false;
+  Register Carry = MI.getOperand(1).getReg();
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
+  Register Dst = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(2).getReg();
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildCopy(Dst, LHS);
+    B.buildConstant(Carry, 0);
+  };
+  return true;
+}
+
 MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UDIV);
   auto &UDiv = cast<GenericMachineInstr>(MI);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
new file mode 100644
index 0000000000000..0a6b85bd45ddf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -0,0 +1,141 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# REQUIRES: asserts
+
+# (G_*ADDO x, 0) -> x + no carry
+
+...
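+# uadd_zero and sadd_zero cover the scalar G_UADDO and G_SADDO forms, wrong_cst
+# is a negative test whose right-hand constant is nonzero, and the *_vec_zero
+# tests cover the splat G_BUILD_VECTOR-of-zeros form matched by
+# m_SpecificICstOrSplat.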
+---
+name:            uadd_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: uadd_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %add:_(s32) = COPY %lhs(s32)
+    ; CHECK-NEXT: %o:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %zero:_(s32) = G_CONSTANT i32 0
+    %add:_(s32), %o:_(s1) = G_UADDO %lhs, %zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            sadd_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sadd_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %add:_(s32) = COPY %lhs(s32)
+    ; CHECK-NEXT: %o:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %zero:_(s32) = G_CONSTANT i32 0
+    %add:_(s32), %o:_(s1) = G_SADDO %lhs, %zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            wrong_cst
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: wrong_cst
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %not_zero:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_UADDO %lhs, %not_zero
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %not_zero:_(s32) = G_CONSTANT i32 3
+    %add:_(s32), %o:_(s1) = G_UADDO %lhs, %not_zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            uadd_vec_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $x0
+    ; CHECK-LABEL: name: uadd_vec_zero
+    ; CHECK: liveins: $q0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %add:_(<2 x s64>) = COPY %lhs(<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+    ; CHECK-NEXT: %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o(<2 x s1>), %zero(s64)
+    ; CHECK-NEXT: %o_wide:_(s64) = G_ZEXT %o_elt_0(s1)
+    ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %o_wide(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %lhs:_(<2 x s64>) = COPY $q0
+    %zero:_(s64) = G_CONSTANT i64 0
+    %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero
+    %add:_(<2 x s64>), %o:_(<2 x s1>) = G_UADDO %lhs, %zero_vec
+    %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o:_(<2 x s1>), %zero:_(s64)
+    %o_wide:_(s64) = G_ZEXT %o_elt_0
+    $q0 = COPY %add(<2 x s64>)
+    $x0 = COPY %o_wide
+    RET_ReallyLR implicit $q0
+...
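+# sadd_vec_zero is the signed counterpart of uadd_vec_zero.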
+---
+name:            sadd_vec_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $x0
+    ; CHECK-LABEL: name: sadd_vec_zero
+    ; CHECK: liveins: $q0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %add:_(<2 x s64>) = COPY %lhs(<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+    ; CHECK-NEXT: %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o(<2 x s1>), %zero(s64)
+    ; CHECK-NEXT: %o_wide:_(s64) = G_ZEXT %o_elt_0(s1)
+    ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %o_wide(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %lhs:_(<2 x s64>) = COPY $q0
+    %zero:_(s64) = G_CONSTANT i64 0
+    %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero
+    %add:_(<2 x s64>), %o:_(<2 x s1>) = G_SADDO %lhs, %zero_vec
+    %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o:_(<2 x s1>), %zero:_(s64)
+    %o_wide:_(s64) = G_ZEXT %o_elt_0
+    $q0 = COPY %add(<2 x s64>)
+    $x0 = COPY %o_wide
+    RET_ReallyLR implicit $q0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index c118e757ef539..77c182f1a8ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4155,10 +4155,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4171,10 +4171,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4187,10 +4187,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4200,13 +4200,13 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:
v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4226,17 +4226,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_add_u32 s0, s2, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -4254,17 +4252,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 +; GFX8-NEXT: s_mov_b32 s3, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_add_u32 s0, s2, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 @@ -4282,17 +4278,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_add_u32 s0, s2, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -4302,23 +4296,21 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s4, s0, s2 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 ; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_addc_u32 s5, s1, s3 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s3, s1, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s3 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4335,10 +4327,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -4351,10 +4343,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -4367,10 +4359,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; @@ -4378,13 +4370,13 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] -; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: 
s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4402,10 +4394,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], 0, v0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -4418,10 +4410,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -4434,10 +4426,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; @@ -4446,12 +4438,12 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4469,21 +4461,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 -; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 
v[4:5], v[2:3] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_add_i32_e64 v3, s[6:7], 0, v2 -; GFX6-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i64: @@ -4495,21 +4486,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 -; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], 0, v2 -; GFX8-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i64: @@ -4521,21 +4511,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[6:7], 0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i64: @@ -4546,21 +4535,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 
v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s7, 0, v[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, v[10:11], v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo -; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, s7, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -4580,20 +4568,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_add_u32 s0, s4, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_addc_u32 s1, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -4603,17 +4589,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_add_u32 s0, s4, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 ; GFX6-NEXT: v_readfirstlane_b32 s1, v2 @@ -4633,20 +4617,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: 
s_ashr_i32 s4, s9, 31 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_add_u32 s0, s4, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: s_addc_u32 s1, s4, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -4656,17 +4638,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 @@ -4686,20 +4666,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_add_u32 s0, s4, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: s_addc_u32 s1, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -4709,17 +4687,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_add_u32 s0, s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, 
v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 @@ -4733,39 +4709,34 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_mov_b32 s11, 0 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: s_addc_u32 s9, s1, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: s_ashr_i32 s1, s9, 31 +; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s8, s4, s0 -; GFX10-NEXT: s_add_u32 s0, s1, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_xor_b32 s8, s4, s1 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10-NEXT: s_and_b32 s5, s5, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 -; GFX10-NEXT: s_ashr_i32 s1, s5, 31 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_add_u32 s0, s1, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -4803,24 +4774,22 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: s_ashr_i32 s3, s9, 31 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s3, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_ashr_i32 s0, s9, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_addc_u32 s1, s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s2, s3, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s6, s6, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 @@ -4873,24 +4842,22 @@ define 
amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: s_ashr_i32 s3, s9, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s3, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_ashr_i32 s0, s9, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_addc_u32 s1, s0, 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s3, 0 +; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s6, s6, 1 -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 +; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -4943,24 +4910,22 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: s_ashr_i32 s3, s9, 31 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s3, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s9, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_addc_u32 s1, s0, 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s3, 0 +; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s6, s6, 1 -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -4984,54 +4949,52 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_add_u32 s4, s0, s4 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s10, s[6:7], 0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s5, s1, s5 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s8, s2, s6 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_addc_u32 s9, s3, s7 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[2:3] -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 
s2, s[8:9], s[2:3] +; GFX10-NEXT: s_cselect_b32 s10, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s9 -; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s10 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s3, s9, 31 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 -; GFX10-NEXT: s_add_u32 s0, s3, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: s_addc_u32 s2, s3, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo @@ -5056,7 +5019,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc ; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5067,17 +5029,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX6-NEXT: s_mov_b64 vcc, 0 ; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v3 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc +; GFX6-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; 
GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_sv: @@ -5090,7 +5053,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5101,17 +5063,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX8-NEXT: s_mov_b64 vcc, 0 ; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v3 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_sv: @@ -5124,7 +5087,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -5135,17 +5097,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc +; GFX9-NEXT: s_mov_b64 vcc, 0 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v7, 1 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i128_sv: @@ -5165,17 +5128,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg 
%lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v3, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0x80000000, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5196,25 +5159,25 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX6-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 +; GFX6-NEXT: s_mov_b64 vcc, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i128_vs: @@ -5241,17 +5204,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX8-NEXT: s_mov_b64 vcc, 0 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc +; GFX8-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, 
vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_i128_vs: @@ -5278,17 +5241,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: s_mov_b64 vcc, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc +; GFX9-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i128_vs: @@ -5310,18 +5273,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5349,42 +5312,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v1, v18, vcc +; GFX6-NEXT: s_mov_b64 vcc, 0 +; GFX6-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc +; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] 
+; GFX6-NEXT: v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5] ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc -; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v8, s[4:5], v4, v12 +; GFX6-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5] +; GFX6-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5] +; GFX6-NEXT: v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5] +; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v5 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v13, vcc, v5, v18, vcc +; GFX6-NEXT: v_addc_u32_e32 v12, vcc, v5, v18, vcc ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i128: @@ -5407,42 +5369,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v1, v18, vcc +; GFX8-NEXT: s_mov_b64 vcc, 0 +; GFX8-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5] ; GFX8-NEXT: 
v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v4, v12 +; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v5 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v5, v18, vcc +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v5, v18, vcc ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i128: @@ -5465,42 +5426,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v18, vcc +; GFX9-NEXT: s_mov_b64 vcc, 0 +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[4:5], 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], v1, v18, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v15, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v16, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v10, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v4, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v5, v13, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], v6, v14, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], v7, v15, s[4:5] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v5, v18, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v5, v18, vcc ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v2i128: @@ -5525,40 +5485,39 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[10:11], v[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s5, v[10:11], v[4:5] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[12:13], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[6:7] +; GFX10-NEXT: v_cmp_eq_u64_e64 s6, v[12:13], v[6:7] +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 +; 
GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[12:13], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 -; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v5, s4 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v7, s5 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v13 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_cmp_gt_i64_e64 s5, 0, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s6 +; GFX10-NEXT: v_cmp_eq_u64_e64 s6, 0, v[14:15] +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4 +; GFX10-NEXT: v_xor_b32_e32 v3, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v2, s5 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v4, s5 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, v9, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -5592,23 +5551,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 -; GFX6-NEXT: s_ashr_i32 s3, s17, 31 +; GFX6-NEXT: s_brev_b32 s10, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s3, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_ashr_i32 s0, s17, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_addc_u32 s1, s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, 
1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s2, s3, 0 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_and_b32 s11, s11, 1 -; GFX6-NEXT: s_brev_b32 s10, 1 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_addc_u32 s2, s0, 0 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 +; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_addc_u32 s3, s3, s10 +; GFX6-NEXT: s_addc_u32 s3, s0, s10 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_add_u32 s0, s4, s12 @@ -5649,24 +5606,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 -; GFX6-NEXT: s_ashr_i32 s7, s3, 31 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX6-NEXT: s_add_u32 s4, s7, 0 -; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_and_b32 s5, s5, 1 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: s_addc_u32 s5, s7, 0 +; GFX6-NEXT: s_addc_u32 s5, s4, 0 ; GFX6-NEXT: s_cselect_b32 s6, 1, 0 ; GFX6-NEXT: s_and_b32 s6, s6, 1 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_addc_u32 s6, s7, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: s_addc_u32 s6, s4, 0 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s8, s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_and_b32 s7, s7, 1 +; GFX6-NEXT: s_cmp_lg_u32 s7, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s7, s10 +; GFX6-NEXT: s_addc_u32 s7, s4, s10 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5723,23 +5678,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX8-NEXT: s_ashr_i32 s3, s17, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s3, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_ashr_i32 s0, s17, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_addc_u32 s1, s0, 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s3, 0 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_and_b32 s11, s11, 1 +; GFX8-NEXT: s_addc_u32 s2, s0, 0 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: s_brev_b32 s10, 1 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_addc_u32 s3, s3, s10 +; GFX8-NEXT: s_addc_u32 s3, s0, s10 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_add_u32 s0, s4, s12 @@ -5786,24 +5739,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX8-NEXT: s_and_b32 s4, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX8-NEXT: s_ashr_i32 s7, s3, 31 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX8-NEXT: s_add_u32 s4, s7, 0 -; GFX8-NEXT: s_cselect_b32 
s5, 1, 0 -; GFX8-NEXT: s_and_b32 s5, s5, 1 +; GFX8-NEXT: s_ashr_i32 s4, s3, 31 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 -; GFX8-NEXT: s_addc_u32 s5, s7, 0 +; GFX8-NEXT: s_addc_u32 s5, s4, 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: s_and_b32 s6, s6, 1 ; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_addc_u32 s6, s7, 0 +; GFX8-NEXT: s_addc_u32 s6, s4, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s8, 0 +; GFX8-NEXT: s_and_b32 s7, s7, 1 +; GFX8-NEXT: s_cmp_lg_u32 s7, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s7, s10 +; GFX8-NEXT: s_addc_u32 s7, s4, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5860,23 +5811,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; GFX9-NEXT: s_ashr_i32 s3, s17, 31 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s3, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s17, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_addc_u32 s1, s0, 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s3, 0 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 +; GFX9-NEXT: s_addc_u32 s2, s0, 0 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: s_brev_b32 s10, 1 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_addc_u32 s3, s3, s10 +; GFX9-NEXT: s_addc_u32 s3, s0, s10 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_add_u32 s0, s4, s12 @@ -5923,24 +5872,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-NEXT: s_and_b32 s4, 1, s6 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 -; GFX9-NEXT: s_ashr_i32 s7, s3, 31 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] -; GFX9-NEXT: s_add_u32 s4, s7, 0 -; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_and_b32 s5, s5, 1 +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 -; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_addc_u32 s5, s4, 0 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 ; GFX9-NEXT: s_and_b32 s6, s6, 1 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_addc_u32 s6, s7, 0 +; GFX9-NEXT: s_addc_u32 s6, s4, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s8, 0 +; GFX9-NEXT: s_and_b32 s7, s7, 1 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s7, s10 +; GFX9-NEXT: s_addc_u32 s7, s4, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5974,9 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: v_cmp_lt_u64_e64 
s0, s[8:9], s[0:1] ; GFX10-NEXT: s_and_b32 s16, s16, 1 -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: s_addc_u32 s16, s2, s10 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -5985,80 +5930,78 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_addc_u32 s17, s3, s11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s17 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s18 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_ashr_i32 s3, s17, 31 -; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 -; GFX10-NEXT: s_add_u32 s0, s3, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX10-NEXT: s_ashr_i32 s0, s17, 31 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: s_addc_u32 s2, s3, 0 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_b32 s11, s11, 1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_addc_u32 s3, s3, s10 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_add_u32 s0, s4, s12 -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo -; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s16 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s16 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s1, s5, s13 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] +; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5] ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[14:15], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[14:15], 0 ; GFX10-NEXT: s_addc_u32 s8, s6, s14 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3 ; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 
v6, s1 -; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 ; GFX10-NEXT: s_addc_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7] ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 ; GFX10-NEXT: s_and_b32 s2, 1, s2 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 -; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: s_ashr_i32 s5, s9, 31 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 -; GFX10-NEXT: s_and_b32 s3, 1, s2 -; GFX10-NEXT: s_add_u32 s2, s5, 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s3, 0, s3 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s3 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_and_b32 s3, 1, s3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, 0, s2 +; GFX10-NEXT: s_ashr_i32 s2, s9, 31 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 +; GFX10-NEXT: s_addc_u32 s3, s2, 0 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX10-NEXT: s_and_b32 s4, s4, 1 @@ -6066,12 +6009,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: s_addc_u32 s4, s5, 0 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-NEXT: s_and_b32 s6, s6, 1 +; GFX10-NEXT: s_addc_u32 s4, s2, 0 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_and_b32 s5, s5, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s1, s5, s10 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_addc_u32 s1, s2, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 1f8a322dbdd12..dbd4eab951828 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4141,10 +4141,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], 0, v0 +; GFX6-NEXT: s_mov_b64 s[6:7], 0 ; GFX6-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4157,10 +4157,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], 0, v0 +; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4173,10 +4173,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], 0, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7] ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4186,13 +4186,13 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4212,17 +4212,15 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 +; GFX6-NEXT: s_mov_b32 s3, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_add_u32 s0, s2, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v1 @@ -4240,17 +4238,15 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 +; GFX8-NEXT: s_mov_b32 s3, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_add_u32 s0, s2, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 @@ -4268,17 +4264,15 @@ define amdgpu_ps i64 
@s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_add_u32 s0, s2, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: s_addc_u32 s1, s2, 0x80000000 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -4288,23 +4282,21 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sub_u32 s4, s0, s2 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 ; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s3, s1, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s3 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, s2, s1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4321,10 +4313,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 +; GFX6-NEXT: s_mov_b64 s[2:3], 0 ; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -4337,10 +4329,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 ; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -4353,10 +4345,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 
inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; @@ -4364,13 +4356,13 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] -; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4388,10 +4380,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], 0, v0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog ; @@ -4404,10 +4396,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], 0, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], 0 ; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog ; @@ -4420,10 +4412,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 0, v0 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; @@ -4432,12 +4424,12 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, 
v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 ; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4455,21 +4447,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 -; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX6-NEXT: s_mov_b64 s[6:7], 0 +; GFX6-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX6-NEXT: v_add_i32_e64 v3, s[6:7], 0, v2 -; GFX6-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i64: @@ -4481,21 +4472,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 -; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], 0, v2 -; GFX8-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i64: @@ -4507,21 +4497,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 -; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7] ; 
GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[6:7], 0, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7] ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v2i64: @@ -4532,21 +4521,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s7, 0, v[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, v[10:11], v[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo -; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX10-NEXT: s_xor_b32 vcc_lo, s7, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) @@ -4566,20 +4554,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_add_u32 s0, s4, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_addc_u32 s1, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, 
v0, vcc ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -4589,17 +4575,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_add_u32 s0, s4, 0 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 -; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 ; GFX6-NEXT: v_readfirstlane_b32 s1, v2 @@ -4619,20 +4603,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_add_u32 s0, s4, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: s_addc_u32 s1, s4, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -4642,17 +4624,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_add_u32 s0, s4, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_cmp_lg_u32 s2, 0 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 @@ -4672,20 +4652,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_add_u32 s0, s4, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: s_addc_u32 s1, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_sub_u32 s0, 
s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -4695,17 +4673,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_add_u32 s0, s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, 1, 0 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 @@ -4719,39 +4695,34 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: s_and_b32 s9, s9, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_mov_b32 s11, 0 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] -; GFX10-NEXT: s_ashr_i32 s1, s9, 31 +; GFX10-NEXT: s_brev_b32 s10, 1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_xor_b32 s8, s4, s0 -; GFX10-NEXT: s_add_u32 s0, s1, 0 -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_xor_b32 s8, s4, s1 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10-NEXT: s_and_b32 s5, s5, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 -; GFX10-NEXT: s_ashr_i32 s1, s5, 31 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_add_u32 s0, s1, 0 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: s_addc_u32 s1, s0, s10 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -4789,26 +4760,24 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 
s[10:11], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 -; GFX6-NEXT: s_ashr_i32 s3, s11, 31 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s3, 0 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: s_ashr_i32 s0, s11, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: s_addc_u32 s1, s3, 0 +; GFX6-NEXT: s_addc_u32 s1, s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s2, s3, 0 +; GFX6-NEXT: s_addc_u32 s2, s0, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s4, s4, 1 -; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 @@ -4863,24 +4832,22 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX8-NEXT: s_and_b32 s0, 1, s2 -; GFX8-NEXT: s_ashr_i32 s3, s11, 31 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_add_u32 s0, s3, 0 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_ashr_i32 s0, s11, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_addc_u32 s1, s0, 0 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s2, s3, 0 +; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_cmp_lg_u32 s4, 0 +; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 @@ -4935,24 +4902,22 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GFX9-NEXT: s_and_b32 s0, 1, s2 -; GFX9-NEXT: s_ashr_i32 s3, s11, 31 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_add_u32 s0, s3, 0 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s11, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s3, 0 +; GFX9-NEXT: s_addc_u32 s1, s0, 0 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s2, s3, 0 +; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: 
s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000
+; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
 ; GFX9-NEXT: v_mov_b32_e32 v3, s8
@@ -4987,45 +4952,43 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT: s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT: s_and_b32 s11, s11, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, s10
 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT: s_subb_u32 s11, s3, s7
-; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT: v_mov_b32_e32 v4, s11
 ; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[4:5], 0
 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[4:5], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_ashr_i32 s3, s11, 31
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_add_u32 s0, s3, 0
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT: s_and_b32 s1, 1, s1
+; GFX10-NEXT: s_ashr_i32 s0, s11, 31
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT: s_mov_b32 s1, 0
 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s9
-; GFX10-NEXT: s_addc_u32 s1, s3, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
+; GFX10-NEXT: s_addc_u32 s1, s0, 0
 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
 ; GFX10-NEXT: s_and_b32 s2, s2, 1
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
+; GFX10-NEXT: s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_addc_u32 s2, s3, 0
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
-; GFX10-NEXT: s_and_b32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s3, s3, 1
+; GFX10-NEXT: v_mov_b32_e32 v3, s10
+; GFX10-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000
+; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
@@ -5060,20 +5023,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX6-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX6-NEXT: s_mov_b64 vcc, 0
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX6-NEXT: v_bfrev_b32_e32 v8, 1
-; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_i128_sv:
@@ -5096,20 +5059,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: s_mov_b64 vcc, 0
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_bfrev_b32_e32 v8, 1
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: ssubsat_i128_sv:
@@ -5132,20 +5095,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: s_mov_b64 vcc, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT: v_bfrev_b32_e32 v8, 1
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i128_sv:
@@ -5167,17 +5130,17 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7
+; GFX10-NEXT: s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5198,9 +5161,9 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: v_bfrev_b32_e32 v8, 1
 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -5209,16 +5172,16 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT: s_mov_b64 vcc, 0
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_i128_vs:
@@ -5241,23 +5204,23 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX8-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX8-NEXT: s_and_b32 s0, 1, s4
 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX8-NEXT: s_mov_b64 vcc, 0
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: ssubsat_i128_vs:
@@ -5280,23 +5243,23 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-NEXT: v_bfrev_b32_e32 v3, 1
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT: s_and_b32 s0, 1, s4
 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT: s_mov_b64 vcc, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i128_vs:
@@ -5320,18 +5283,18 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX10-NEXT: s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7
 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v8, s0
 ; GFX10-NEXT: ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5361,44 +5324,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v19
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc
+; GFX6-NEXT: s_mov_b64 vcc, 0
+; GFX6-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX6-NEXT: v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12
-; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12
+; GFX6-NEXT: v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX6-NEXT: v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX6-NEXT: v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX6-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4
 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX6-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX6-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc
+; GFX6-NEXT: v_addc_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i128:
@@ -5423,44 +5385,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v19
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v20, vcc
+; GFX8-NEXT: s_mov_b64 vcc, 0
+; GFX8-NEXT: v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12
-; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12
+; GFX8-NEXT: v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX8-NEXT: v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX8-NEXT: v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v5
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v5, v20, vcc
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i128:
@@ -5485,44 +5446,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v19
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v20, vcc
+; GFX9-NEXT: s_mov_b64 vcc, 0
+; GFX9-NEXT: v_addc_co_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v4, v12
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX9-NEXT: v_subb_co_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v5, v20, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i128:
@@ -5549,42 +5509,41 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo
 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
+; GFX10-NEXT: v_cmp_lt_u64_e64 s5, v[8:9], v[4:5]
 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19
-; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[10:11], v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s6, v[10:11], v[6:7]
+; GFX10-NEXT: s_mov_b32 vcc_lo, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[6:7]
 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13]
-; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[14:15]
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5
-; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v12, s5
-; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v2, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v5, s4
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v4
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v5, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v7, s5
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
+; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT: v_cmp_lt_u64_e64 s5, 0, v[12:13]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s5, 0, v[14:15]
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s6
+; GFX10-NEXT: v_cmp_eq_u64_e64 s6, 0, v[14:15]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v5, s6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4
+; GFX10-NEXT: v_xor_b32_e32 v3, v3, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0x80000000, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v1, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v2, s5
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v4, s5
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v5, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v7, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v13, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -5618,25 +5577,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1]
 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT: s_ashr_i32 s3, s19, 31
 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT: s_add_u32 s0, s3, 0
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
+; GFX6-NEXT: s_ashr_i32 s0, s19, 31
+; GFX6-NEXT: s_mov_b32 s1, 0
 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0
-; GFX6-NEXT: s_addc_u32 s1, s3, 0
+; GFX6-NEXT: s_addc_u32 s1, s0, 0
 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT: s_and_b32 s2, s2, 1
 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s2, s3, 0
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: s_addc_u32 s2, s0, 0
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
+; GFX6-NEXT: s_and_b32 s3, s3, 1
 ; GFX6-NEXT: s_brev_b32 s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
+; GFX6-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: s_addc_u32 s3, s3, s8
+; GFX6-NEXT: s_addc_u32 s3, s0, s8
 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
 ; GFX6-NEXT: s_sub_u32 s0, s4, s12
@@ -5677,26 +5634,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT: s_ashr_i32 s7, s3, 31
 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX6-NEXT: s_add_u32 s4, s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
-; GFX6-NEXT: s_and_b32 s5, s5, 1
+; GFX6-NEXT: s_ashr_i32 s4, s3, 31
+; GFX6-NEXT: s_mov_b32 s5, 0
 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0
-; GFX6-NEXT: s_addc_u32 s5, s7, 0
+; GFX6-NEXT: s_addc_u32 s5, s4, 0
 ; GFX6-NEXT: s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT: s_and_b32 s6, s6, 1
 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
 ; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_addc_u32 s6, s7, 0
+; GFX6-NEXT: s_addc_u32 s6, s4, 0
 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
+; GFX6-NEXT: s_and_b32 s7, s7, 1
+; GFX6-NEXT: s_cmp_lg_u32 s7, 0
 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_addc_u32 s7, s7, s8
+; GFX6-NEXT: s_addc_u32 s7, s4, s8
 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
 ; GFX6-NEXT: v_mov_b32_e32 v2, s5
 ; GFX6-NEXT: v_mov_b32_e32 v3, s0
@@ -5755,23 +5710,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: s_ashr_i32 s3, s19, 31
 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_add_u32 s0, s3, 0
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_ashr_i32 s0, s19, 31
+; GFX8-NEXT: s_mov_b32 s1, 0
 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: s_addc_u32 s1, s0, 0
 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT: s_and_b32 s2, s2, 1
 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0
-; GFX8-NEXT: s_addc_u32 s2, s3, 0
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
+; GFX8-NEXT: s_addc_u32 s2, s0, 0
+; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_and_b32 s3, s3, 1
 ; GFX8-NEXT: s_brev_b32 s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
+; GFX8-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: s_addc_u32 s3, s3, s8
+; GFX8-NEXT: s_addc_u32 s3, s0, s8
 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
 ; GFX8-NEXT: s_sub_u32 s0, s4, s12
@@ -5820,24 +5773,22 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX8-NEXT: s_and_b32 s4, 1, s6
-; GFX8-NEXT: s_ashr_i32 s7, s3, 31
 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: s_add_u32 s4, s7, 0
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
+; GFX8-NEXT: s_ashr_i32 s4, s3, 31
+; GFX8-NEXT: s_mov_b32 s5, 0
 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0
-; GFX8-NEXT: s_addc_u32 s5, s7, 0
+; GFX8-NEXT: s_addc_u32 s5, s4, 0
 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT: s_and_b32 s6, s6, 1
 ; GFX8-NEXT: s_cmp_lg_u32 s6, 0
-; GFX8-NEXT: s_addc_u32 s6, s7, 0
+; GFX8-NEXT: s_addc_u32 s6, s4, 0
 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
+; GFX8-NEXT: s_and_b32 s7, s7, 1
+; GFX8-NEXT: s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_addc_u32 s7, s7, s8
+; GFX8-NEXT: s_addc_u32 s7, s4, s8
 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
 ; GFX8-NEXT: v_mov_b32_e32 v2, s5
 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
@@ -5896,23 +5847,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: s_ashr_i32 s3, s19, 31
 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_add_u32 s0, s3, 0
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_ashr_i32 s0, s19, 31
+; GFX9-NEXT: s_mov_b32 s1, 0
 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_addc_u32 s1, s3, 0
+; GFX9-NEXT: s_addc_u32 s1, s0, 0
 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT: s_and_b32 s2, s2, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_addc_u32 s2, s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
+; GFX9-NEXT: s_addc_u32 s2, s0, 0
+; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_and_b32 s3, s3, 1
 ; GFX9-NEXT: s_brev_b32 s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: s_addc_u32 s3, s3, s8
+; GFX9-NEXT: s_addc_u32 s3, s0, s8
 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
 ; GFX9-NEXT: s_sub_u32 s0, s4, s12
@@ -5961,24 +5910,22 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9-NEXT: s_and_b32 s4, 1, s6
-; GFX9-NEXT: s_ashr_i32 s7, s3, 31
 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: s_add_u32 s4, s7, 0
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
+; GFX9-NEXT: s_ashr_i32 s4, s3, 31
+; GFX9-NEXT: s_mov_b32 s5, 0
 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0
-; GFX9-NEXT: s_addc_u32 s5, s7, 0
+; GFX9-NEXT: s_addc_u32 s5, s4, 0
 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT: s_and_b32 s6, s6, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_addc_u32 s6, s7, 0
+; GFX9-NEXT: s_addc_u32 s6, s4, 0
 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: s_and_b32 s7, s7, 1
+; GFX9-NEXT: s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_addc_u32 s7, s7, s8
+; GFX9-NEXT: s_addc_u32 s7, s4, s8
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
@@ -6012,7 +5959,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT: s_and_b32 s18, s18, 1
-; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[8:9], 0
 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT: s_subb_u32 s18, s2, s10
 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0
@@ -6022,98 +5968,94 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT: s_subb_u32 s19, s3, s11
 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
 ; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v3, s19
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT: s_and_b32 s0, 1, s20
 ; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_ashr_i32 s3, s19, 31
-; GFX10-NEXT: s_and_b32 s0, 1, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_add_u32 s0, s3, 0
-; GFX10-NEXT: s_brev_b32 s10, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s17
-; GFX10-NEXT: s_addc_u32 s1, s3, 0
+; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_and_b32 s1, 1, s1
+; GFX10-NEXT: s_ashr_i32 s0, s19, 31
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT: s_cmp_lg_u32 s10, 0
+; GFX10-NEXT: s_brev_b32 s11, 1
+; GFX10-NEXT: s_addc_u32 s1, s0, 0
 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT: s_and_b32 s2, s2, 1
-; GFX10-NEXT: v_mov_b32_e32 v1, s16
+; GFX10-NEXT: v_mov_b32_e32 v2, s17
 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s19
+; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
+; GFX10-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s16
+; GFX10-NEXT: s_and_b32 s3, s3, 1
 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_addc_u32 s2, s3, 0
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: s_and_b32 s8, s8, 1
+; GFX10-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10-NEXT: s_addc_u32 s3, s0, s11
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
-; GFX10-NEXT: s_addc_u32 s3, s3, s10
 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT: s_sub_u32 s0, s4, s12
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, s18
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT: s_and_b32 s1, s1, 1
+; GFX10-NEXT: v_mov_b32_e32 v2, s18
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT: s_subb_u32 s1, s5, s13
 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
 ; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e64 s4, s[12:13], 0
 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0
-; GFX10-NEXT: v_cmp_gt_u64_e64 s3, s[12:13], 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
 ; GFX10-NEXT: s_subb_u32 s8, s6, s14
 ; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
 ; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: v_mov_b32_e32 v7, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[14:15], 0
 ; GFX10-NEXT: s_subb_u32 s9, s7, s15
 ; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v8, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
 ; GFX10-NEXT: s_and_b32 s2, 1, s2
 ; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: s_ashr_i32 s5, s9, 31
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[14:15], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3
-; GFX10-NEXT: s_and_b32 s3, 1, s2
-; GFX10-NEXT: s_add_u32 s2, s5, 0
+; GFX10-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-NEXT: s_ashr_i32 s2, s9, 31
+; GFX10-NEXT: s_and_b32 s3, 1, s3
+; GFX10-NEXT: s_cmp_lg_u32 s10, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT: s_addc_u32 s3, s2, 0
 ; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: v_mov_b32_e32 v8, s9
 ; GFX10-NEXT: s_and_b32 s4, s4, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT: v_mov_b32_e32 v6, s1
-; GFX10-NEXT: s_addc_u32 s3, s5, 0
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: s_addc_u32 s4, s2, 0
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT: s_and_b32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s5, s5, 1
 ; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, s8
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: s_addc_u32 s4, s5, 0
-; GFX10-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-NEXT: s_and_b32 s6, s6, 1
+; GFX10-NEXT: s_addc_u32 s1, s2, s11
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-NEXT: s_addc_u32 s1, s5, s10
 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo