diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 97f29015c6911..5d9e3f2e64a82 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -531,9 +531,11 @@ class CombinerHelper { /// Check if operand \p OpIdx is undef. bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) const; - /// Check if operand \p OpIdx is known to be a power of 2. - bool matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, - unsigned OpIdx) const; + /// Check if operand \p MO is known to be a power of 2. When \p OrNegative + /// is true, also match operands whose negation is a power of 2 (i.e. whose + /// absolute value is a power of 2). + bool matchOperandIsKnownToBeAPowerOfTwo(const MachineOperand &MO, + bool OrNegative = false) const; /// Erase \p MI void eraseInst(MachineInstr &MI) const; @@ -752,6 +754,9 @@ class CombinerHelper { /// return expressions that implements it by shifting. void applyUDivByPow2(MachineInstr &MI) const; + /// Combine G_SREM x, (+/-2^k) to a bias-and-mask sequence. + void applySimplifySRemByPow2(MachineInstr &MI) const; + // G_UMULH x, (1 << c)) -> x >> (bitwidth - c) bool matchUMulHToLShr(MachineInstr &MI) const; void applyUMulHToLShr(MachineInstr &MI) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index a861a60ecfcd3..24b82b4c9d2f0 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -333,10 +333,13 @@ ConstantFoldICmp(unsigned Pred, const Register Op1, const Register Op2, /// Test if the given value is known to have exactly one bit set. This differs /// from computeKnownBits in that it doesn't necessarily determine which bit is -/// set. +/// set. When \p OrNegative is true, the value is also considered a power of two +/// if its negation is a power of two (i.e. 
its absolute value is a power of +/// two). LLVM_ABI bool isKnownToBeAPowerOfTwo(Register Val, const MachineRegisterInfo &MRI, - GISelValueTracking *ValueTracking = nullptr); + GISelValueTracking *ValueTracking = nullptr, + bool OrNegative = false); LLVM_ABI Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 6370197d044e7..1288295cbe26e 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -659,8 +659,8 @@ def binop_with_neg : GICombineRule< def urem_pow2_to_mask : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_UREM):$root, - [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(*${root}, 2); }]), + (match (G_UREM $dst, $x, $y):$root, + [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(${y}); }]), (apply [{ Helper.applySimplifyURemByPow2(*${root}); }]) >; @@ -1338,7 +1338,15 @@ def srem_by_const : GICombineRule< [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; -def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>; +def srem_pow2_to_mask : GICombineRule< + (defs root:$root), + (match (G_SREM $dst, $x, $y):$root, + [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(${y}, + /*OrNegative=*/true); }]), + (apply [{ Helper.applySimplifySRemByPow2(*${root}); }])>; + +def intrem_combines : GICombineGroup<[srem_pow2_to_mask, urem_by_const, + srem_by_const]>; def reassoc_ptradd : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 95b7f864c16fd..14cc98ee1fa79 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -3060,10 +3060,9 @@ bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, 
         getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
 }
 
-bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI,
-                                                        unsigned OpIdx) const {
-  MachineOperand &MO = MI.getOperand(OpIdx);
-  return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, VT);
+bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(
+    const MachineOperand &MO, bool OrNegative) const {
+  return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, VT, OrNegative);
 }
 
 void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI,
@@ -6103,6 +6102,43 @@ void CombinerHelper::applyUDivByPow2(MachineInstr &MI) const {
   MI.eraseFromParent();
 }
 
+void CombinerHelper::applySimplifySRemByPow2(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_SREM && "Expected SREM");
+  auto &SRem = cast<GenericMachineInstr>(MI);
+  Register Dst = SRem.getReg(0);
+  Register LHS = SRem.getReg(1);
+  Register RHS = SRem.getReg(2);
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+
+  // Effectively we want to lower G_SREM %lhs, %rhs, where %rhs is +/- a power
+  // of 2, to the following branch-free bias-and-mask version:
+  //
+  //   %abs    = G_ABS %rhs
+  //   %mask   = G_SUB %abs, 1
+  //   %sign   = G_ASHR %lhs, $(bitwidth - 1)
+  //   %bias   = G_AND %sign, %mask
+  //   %biased = G_ADD %lhs, %bias
+  //   %masked = G_AND %biased, %mask
+  //   %res    = G_SUB %masked, %bias
+  //
+  // The bias adds (|%rhs| - 1) for negative %lhs, correcting rounding towards
+  // zero (instead of towards -inf that a plain mask would give). Constant
+  // divisors collapse %mask to a single G_CONSTANT via the CSEMIRBuilder folds
+  // for G_ABS and G_SUB.
+
+  unsigned BitWidth = Ty.getScalarSizeInBits();
+  auto AbsRHS = Builder.buildAbs(Ty, RHS);
+  auto Mask = Builder.buildSub(Ty, AbsRHS, Builder.buildConstant(Ty, 1));
+  auto BWMinusOne = Builder.buildConstant(ShiftAmtTy, BitWidth - 1);
+  auto Sign = Builder.buildAShr(Ty, LHS, BWMinusOne);
+  auto Bias = Builder.buildAnd(Ty, Sign, Mask);
+  auto Biased = Builder.buildAdd(Ty, LHS, Bias);
+  auto Masked = Builder.buildAnd(Ty, Biased, Mask);
+  Builder.buildSub(Dst, Masked, Bias);
+  MI.eraseFromParent();
+}
+
 bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) const {
   assert(MI.getOpcode() == TargetOpcode::G_UMULH);
   Register RHS = MI.getOperand(2).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 37af0716df9ec..121a73ba55165 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1068,7 +1068,7 @@ llvm::ConstantFoldICmp(unsigned Pred, const Register Op1, const Register Op2,
 }
 
 bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
-                                  GISelValueTracking *VT) {
+                                  GISelValueTracking *VT, bool OrNegative) {
   std::optional<DefinitionAndSourceRegister> DefSrcReg =
       getDefSrcRegIgnoringCopies(Reg, MRI);
   if (!DefSrcReg)
@@ -1077,11 +1077,15 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
   const MachineInstr &MI = *DefSrcReg->MI;
   const LLT Ty = MRI.getType(Reg);
 
+  auto IsPow2 = [OrNegative](const APInt &V) {
+    return V.isPowerOf2() || (OrNegative && V.isNegatedPowerOf2());
+  };
+
   switch (MI.getOpcode()) {
   case TargetOpcode::G_CONSTANT: {
     unsigned BitWidth = Ty.getScalarSizeInBits();
     const ConstantInt *CI = MI.getOperand(1).getCImm();
-    return CI->getValue().zextOrTrunc(BitWidth).isPowerOf2();
+    return IsPow2(CI->getValue().zextOrTrunc(BitWidth));
   }
   case TargetOpcode::G_SHL: {
     // A left-shift of a constant one will have exactly one bit set because
     //
TODO: Probably should have a recursion depth guard since you could have // bitcasted vector elements. for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) - if (!isKnownToBeAPowerOfTwo(MO.getReg(), MRI, VT)) + if (!isKnownToBeAPowerOfTwo(MO.getReg(), MRI, VT, OrNegative)) return false; return true; @@ -1118,7 +1122,7 @@ bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, const unsigned BitWidth = Ty.getScalarSizeInBits(); for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) { auto Const = getIConstantVRegVal(MO.getReg(), MRI); - if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2()) + if (!Const || !IsPow2(Const->zextOrTrunc(BitWidth))) return false; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir new file mode 100644 index 0000000000000..c22a421ada922 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-srem-by-pow2.mir @@ -0,0 +1,281 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: srem_pos_pow2_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_pos_pow2_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $w0 = COPY [[SUB]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 
4 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: srem_neg_pow2_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_neg_pow2_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $w0 = COPY [[SUB]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 -4 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: srem_one_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_one_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: srem_minus_one_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_minus_one_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 -1 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... 
+--- +name: srem_intmin_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_intmin_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C1]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ASHR]], [[C]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $w0 = COPY [[SUB]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 -2147483648 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: srem_pow2_s64 +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: srem_pow2_s64 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 7 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ASHR]], [[C]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $x0 = COPY [[SUB]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 8 + %2:_(s64) = G_SREM %0, %1 + $x0 = COPY %2(s64) + RET_ReallyLR implicit $x0 + +... 
+--- +name: srem_pow2_v4s32_splat +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: srem_pow2_v4s32_splat + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = G_ASHR [[COPY]], [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[ASHR]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s32>) = G_AND [[ADD]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $q0 = COPY [[SUB]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 4 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_SREM %0, %2 + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: srem_pow2_v4s32_mixed_sign +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: srem_pow2_v4s32_mixed_sign + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C4]](s32), [[C4]](s32), [[C4]](s32), [[C4]](s32) + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = G_ASHR [[COPY]], [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[ASHR]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[COPY]], [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s32>) = G_AND [[ADD]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[AND1]], [[AND]] + ; CHECK-NEXT: $q0 = COPY [[SUB]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %c0:_(s32) = G_CONSTANT i32 4 + %c1:_(s32) = G_CONSTANT i32 -8 + %c2:_(s32) = G_CONSTANT i32 16 + %c3:_(s32) = G_CONSTANT i32 -1 + %1:_(<4 x s32>) = G_BUILD_VECTOR %c0, %c1, %c2, %c3 + %2:_(<4 x s32>) = G_SREM %0, %1 + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 + +... +--- +# Negative test: non-pow2 divisor should not match. 
+name: srem_nonpow2_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_nonpow2_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1431655766 + ; CHECK-NEXT: [[SMULH:%[0-9]+]]:_(s32) = G_SMULH [[COPY]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SMULH]], [[C2]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SMULH]], [[LSHR]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[ADD]], [[C]] + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[MUL]] + ; CHECK-NEXT: $w0 = COPY [[SUB]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 3 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +# Negative test: non-constant divisor should not match. +name: srem_nonconst_s32 +body: | + bb.1: + liveins: $w0, $w1 + + ; CHECK-LABEL: name: srem_nonconst_s32 + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[COPY1]] + ; CHECK-NEXT: $w0 = COPY [[SREM]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = COPY $w1 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +# Negative test: zero divisor should not match. 
+name: srem_zero_s32 +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: srem_zero_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[C]] + ; CHECK-NEXT: $w0 = COPY [[SREM]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_SREM %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... diff --git a/llvm/test/CodeGen/AArch64/srem-vec-crash.ll b/llvm/test/CodeGen/AArch64/srem-vec-crash.ll index 0b1e430e21105..4a29c1dd17d2a 100644 --- a/llvm/test/CodeGen/AArch64/srem-vec-crash.ll +++ b/llvm/test/CodeGen/AArch64/srem-vec-crash.ll @@ -3,21 +3,10 @@ ; RUN: llc -mtriple=aarch64-unknown-unknown -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i32 @pr84830(i1 %arg) { -; CHECK-SD-LABEL: pr84830: -; CHECK-SD: // %bb.0: // %bb -; CHECK-SD-NEXT: mov w0, #1 // =0x1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: pr84830: -; CHECK-GI: // %bb.0: // %bb -; CHECK-GI-NEXT: mov w8, #1 // =0x1 -; CHECK-GI-NEXT: sbfx w9, w0, #0, #1 -; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: msub w8, w10, w8, w9 -; CHECK-GI-NEXT: eor w8, w8, #0x1 -; CHECK-GI-NEXT: and w0, w8, #0x1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: pr84830: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret bb: %new0 = srem i1 %arg, true %last = zext i1 %new0 to i32 @@ -38,3 +27,6 @@ bb: %i1 = select i1 %i, i32 0, i32 1 ret i32 %i1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 1eb8457cd4a5d..59578e01e9611 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -254,13 +254,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 -; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 11, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 @@ -268,67 +265,20 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { } define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { -; GISEL-LABEL: v_srem_v2i32_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x1000 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v5, 0xfffff000 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; 
GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v7 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xfffff000, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_srem_v2i32_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001 -; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 -; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 -; CGP-NEXT: v_ashrrev_i32_e32 v3, 11, v3 -; CGP-NEXT: v_ashrrev_i32_e32 v2, 11, v2 -; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v2, 12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_srem_v2i32_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 
31, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xfff, v2 +; CHECK-NEXT: v_and_b32_e32 v3, 0xfff, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 970d241b44458..dabef84e2c69e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -971,124 +971,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_srem_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v3, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: 
v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 -; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 -; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: 
v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: 
v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1098,459 +986,35 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-LABEL: v_srem_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: s_mov_b32 s4, 1 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, 
v11 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, 
vcc, v4, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc -; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: s_mov_b32 s4, 1 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; 
GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v13 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v15, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v18, v0, vcc -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v13 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 -; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v13 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v7, v0 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], v8, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v15, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GISEL-NEXT: v_and_b32_e32 v1, 0xfff, v1 +; GISEL-NEXT: v_and_b32_e32 v3, 0xfff, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v12, v12, v9 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v2 -; GISEL-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v9 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xfff, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
CGP-LABEL: v_srem_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] -; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 -; CGP-NEXT: v_mad_u64_u32 
v[12:13], s[4:5], -1, v4, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 -; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; 
CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 -; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v13 -; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v18, v19, v1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v13 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 -; CGP-NEXT: v_cndmask_b32_e32 v11, v16, v18, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; CGP-NEXT: v_and_b32_e32 v1, 0xfff, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_and_b32_e32 v3, 0xfff, v3 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; 
CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_xor_b32_e32 v11, v11, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v11, v12, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; CGP-NEXT: v_and_b32_e32 v2, 0xfff, v2 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result