diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index b575d76e897d2..7e11b863a2869 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1466,6 +1466,319 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
   return false;
 }
 
+/// Match high part of long multiplication.
+///
+/// Considering a multiply made up of high and low parts, we can split the
+/// multiply into:
+///   x * y == (xh*T + xl) * (yh*T + yl)
+/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32. This expands to
+///   xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
+/// which can be drawn as
+///   [   xh*yh   ]
+///         [   xh*yl   ]
+///         [   xl*yh   ]
+///               [   xl*yl   ]
+/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 +
+/// xl*yh>>32 + some carries. The carry makes this difficult and there are
+/// multiple ways of representing it. The ones we attempt to support here are:
+///   Carry: xh*yh + carry + lowsum>>32
+///     carry = lowsum < xh*yl ? 0x100000000 : 0
+///     lowsum = xh*yl + xl*yh + (xl*yl>>32)
+///   Ladder: xh*yh + c2>>32 + c3>>32
+///     c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+///     or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
+///   Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
+///     crosssum = xh*yl + xl*yh
+///     carry = crosssum < xh*yl ? 0x100000000 : 0
+///   Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32
+///     low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+///
+/// They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
+/// tree is xh*yh, xh*yl, xl*yh and xl*yl.
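+///
+/// For reference, the Carry variant corresponds to C source along these
+/// lines (an illustrative sketch, not code from this patch; shown for a
+/// 64-bit multiply with 32-bit halves, uint64_t from <stdint.h>):
+///   uint64_t umulh(uint64_t x, uint64_t y) {
+///     uint64_t xl = x & 0xffffffff, xh = x >> 32;
+///     uint64_t yl = y & 0xffffffff, yh = y >> 32;
+///     // The three partial products that start at bit 32 of the product.
+///     uint64_t lowsum = xh * yl + xl * yh + ((xl * yl) >> 32);
+///     // lowsum can wrap mod 2^64 at most once; if it did, the true value
+///     // is 2^64 higher, which contributes 2^32 to the high half.
+///     uint64_t carry = lowsum < xh * yl ? 0x100000000 : 0;
+///     return xh * yh + carry + (lowsum >> 32);
+///   }
+/// The other variants compute the same value but associate the additions and
+/// the carry handling differently.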
+static bool foldMulHigh(Instruction &I) {
+  Type *Ty = I.getType();
+  if (!Ty->isIntOrIntVectorTy())
+    return false;
+
+  unsigned BW = Ty->getScalarSizeInBits();
+  APInt LowMask = APInt::getLowBitsSet(BW, BW / 2);
+  if (BW % 2 != 0)
+    return false;
+
+  auto CreateMulHigh = [&](Value *X, Value *Y) {
+    IRBuilder<> Builder(&I);
+    Type *NTy = Ty->getWithNewBitWidth(BW * 2);
+    Value *XExt = Builder.CreateZExt(X, NTy);
+    Value *YExt = Builder.CreateZExt(Y, NTy);
+    Value *Mul = Builder.CreateMul(XExt, YExt, "", true);
+    Value *High = Builder.CreateLShr(Mul, BW);
+    Value *Res = Builder.CreateTrunc(High, Ty, "", true);
+    Res->takeName(&I);
+    I.replaceAllUsesWith(Res);
+    LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
+                      << *Y << "\n");
+    return true;
+  };
+
+  // Common check routines for X_lo*Y_lo and X_hi*Y_lo
+  auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
+    return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
+                               m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+  };
+  auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
+    return match(XhYl, m_c_Mul(m_LShr(m_Specific(X), m_SpecificInt(BW / 2)),
+                               m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+  };
+
+  auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
+                              Instruction *B) {
+    // Looking for LowSum >> 32 and carry (select)
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, B);
+
+    // Carry = LowSum < XhYl ? 0x100000000 : 0
+    Value *LowSum, *XhYl;
+    if (!match(Carry,
+               m_OneUse(m_Select(
+                   m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum),
+                                           m_Value(XhYl))),
+                   m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero()))))
+      return false;
+
+    // XhYl can be Xh*Yl or Xl*Yh
+    if (!CheckHiLo(XhYl, X, Y)) {
+      if (CheckHiLo(XhYl, Y, X))
+        std::swap(X, Y);
+      else
+        return false;
+    }
+    if (XhYl->hasNUsesOrMore(3))
+      return false;
+
+    // B = LowSum >> 32
+    if (!match(B,
+               m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) ||
+        LowSum->hasNUsesOrMore(3))
+      return false;
+
+    // LowSum = XhYl + XlYh + XlYl>>32
+    Value *XlYh, *XlYl;
+    auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2));
+    if (!match(LowSum,
+               m_c_Add(m_Specific(XhYl),
+                       m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
+        !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
+                               m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
+        !match(LowSum,
+               m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
+                                                m_OneUse(m_Value(XlYh)))))))
+      return false;
+
+    // Check XlYl and XlYh
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+    if (!CheckHiLo(XlYh, Y, X))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
+                               Instruction *B) {
+    // xh*yh + c2>>32 + c3>>32
+    // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+    // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xh*yl
+    Value *XlYh, *XhYl, *XlYl, *C2, *C3;
+    // Strip off the two expected shifts.
+    if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) ||
+        !match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2))))
+      return false;
+
+    if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value())))
+      std::swap(C2, C3);
+    // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
+    if (match(C2,
+              m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)),
+                              m_Value(XlYh)),
+                      m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)))) ||
+        match(C2,
+              m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)),
+                              m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))),
+                      m_Value(XlYh))) ||
+        match(C2,
+              m_c_Add(m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+                              m_Value(XlYh)),
+                      m_And(m_Specific(C3), m_SpecificInt(LowMask))))) {
+      XhYl = C3;
+    } else {
+      // Match c3 = c2&0xffffffff + xl*yh
+      if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
+                             m_Value(XlYh))))
+        std::swap(C2, C3);
+      if (!match(C3, m_c_Add(m_OneUse(
+                                 m_And(m_Specific(C2), m_SpecificInt(LowMask))),
+                             m_Value(XlYh))) ||
+          !C3->hasOneUse() || C2->hasNUsesOrMore(3))
+        return false;
+
+      // Match c2 = xh*yl + (xl*yl >> 32)
+      if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+                             m_Value(XhYl))))
+        return false;
+    }
+
+    // Match XhYl and XlYh - they can appear either way around.
+    if (!CheckHiLo(XlYh, Y, X))
+      std::swap(XlYh, XhYl);
+    if (!CheckHiLo(XlYh, Y, X))
+      return false;
+    if (!CheckHiLo(XhYl, X, Y))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
+                                Instruction *B, Instruction *C) {
+    /// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
+    /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+
+    // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
+    auto ShiftAdd = m_LShr(m_Add(m_Value(), m_Value()), m_SpecificInt(BW / 2));
+    if (!match(A, ShiftAdd))
+      std::swap(A, B);
+    if (!match(A, ShiftAdd))
+      std::swap(A, C);
+    Value *Low;
+    if (!match(A, m_LShr(m_OneUse(m_Value(Low)), m_SpecificInt(BW / 2))))
+      return false;
+
+    // Match B == XhYl>>32 and C == XlYh>>32
+    Value *XhYl, *XlYh;
+    if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BW / 2))) ||
+        !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BW / 2))))
+      return false;
+    if (!CheckHiLo(XhYl, X, Y))
+      std::swap(XhYl, XlYh);
+    if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
+      return false;
+    if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
+      return false;
+
+    // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
+    Value *XlYl;
+    if (!match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+                    m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
+                m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))) &&
+        !match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+                    m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+                m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
+        !match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
+                    m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+                m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
+                               Instruction *B, Instruction *C) {
+    // xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
+    // crosssum = xh*yl + xl*yh
+    // carry = crosssum < xh*yl ? 0x100000000 : 0
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, B);
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, C);
+
+    // Carry = CrossSum < XhYl ? 0x100000000 : 0
+    Value *CrossSum, *XhYl;
+    if (!match(Carry,
+               m_OneUse(m_Select(
+                   m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT,
+                                           m_Value(CrossSum), m_Value(XhYl))),
+                   m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero()))))
+      return false;
+
+    if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+      std::swap(B, C);
+    if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+      return false;
+
+    Value *XlYl, *LowAccum;
+    if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BW / 2))) ||
+        !match(LowAccum,
+               m_c_Add(m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))),
+                       m_OneUse(m_And(m_Specific(CrossSum),
+                                      m_SpecificInt(LowMask))))) ||
+        LowAccum->hasNUsesOrMore(3))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    if (!CheckHiLo(XhYl, X, Y))
+      std::swap(X, Y);
+    if (!CheckHiLo(XhYl, X, Y))
+      return false;
+    Value *XlYh;
+    if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) ||
+        !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) ||
+        XhYl->hasNUsesOrMore(3))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  // X and Y are the two inputs, A, B and C are other parts of the pattern
+  // (crosssum>>32, carry, etc).
+  Value *X, *Y;
+  Instruction *A, *B, *C;
+  auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BW / 2)),
+                             m_LShr(m_Value(Y), m_SpecificInt(BW / 2))));
+  if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A),
+                                              m_Instruction(B))))) ||
+       match(&I, m_c_Add(m_Instruction(A),
+                         m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
+      A->hasOneUse() && B->hasOneUse())
+    if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
+      return true;
+
+  if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
+                                   m_Instruction(A),
+                                   m_OneUse(m_Add(m_Instruction(B),
+                                                  m_Instruction(C))))))) ||
+       match(&I, m_c_Add(m_Instruction(A),
+                         m_OneUse(m_c_Add(
+                             HiHi, m_OneUse(m_Add(m_Instruction(B),
+                                                  m_Instruction(C))))))) ||
+       match(&I, m_c_Add(m_Instruction(A),
+                         m_OneUse(m_c_Add(
+                             m_Instruction(B),
+                             m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) ||
+       match(&I,
+             m_c_Add(m_OneUse(m_c_Add(HiHi, m_Instruction(A))),
+                     m_OneUse(m_Add(m_Instruction(B), m_Instruction(C)))))) &&
+      A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
+    return FoldMulHighCarry4(X, Y, A, B, C) ||
+           FoldMulHighLadder4(X, Y, A, B, C);
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -1495,6 +1808,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
+      MadeChange |= foldMulHigh(I);
       // NOTE: This function introduces erasing of the instruction `I`, so it
       // needs to be called at the end of this sequence, otherwise we may make
       // bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
new file mode 100644
index 0000000000000..b78095cac0df9
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
@@ -0,0 +1,755 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i32 @mul_carry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i128 @mul_carry_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @mul_carry_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i256 [[TMP3]] to i128
+; CHECK-NEXT: ret i128 [[ADD11]]
+;
+entry:
+  %shr = lshr i128 %x, 64
+  %and = and i128 %x, u0xffffffffffffffff
+  %shr1 = lshr i128 %y, 64
+  %and2 = and i128 %y, u0xffffffffffffffff
+  %mul = mul nuw i128 %shr, %and2
+  %mul3 = mul nuw i128 %and, %shr1
+  %add = add i128 %mul, %mul3
+  %mul4 = mul nuw i128 %and, %and2
+  %shr5 = lshr i128 %mul4, 64
+  %add6 = add i128 %add, %shr5
+  %cmp = icmp ult i128 %add6, %mul
+  %cond = select i1 %cmp, i128 u0x10000000000000000, i128 0
+  %mul8 = mul nuw i128 %shr, %shr1
+  %add9 = add nuw i128 %mul8, %cond
+  %shr10 = lshr i128 %add6, 64
+  %add11 = add i128 %add9, %shr10
+  ret i128 %add11
+}
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define <4 x i32> @mul_carry_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i32> @mul_carry_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i32> [[X]] to <4 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[Y]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <4 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], splat (i64 32)
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[ADD11]]
+;
+entry:
+  %shr = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+  %and = and <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %shr1 = lshr <4 x i32> %y, <i32 16, i32 16, i32 16, i32 16>
+  %and2 = and <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %mul = mul nuw <4 x i32> %shr, %and2
+  %mul3 = mul nuw <4 x i32> %and, %shr1
+  %add = add <4 x i32> %mul, %mul3
+  %mul4 = mul nuw <4 x i32> %and, %and2
+  %shr5 = lshr <4 x i32> %mul4, <i32 16, i32 16, i32 16, i32 16>
+  %add6 = add <4 x i32> %add, %shr5
+  %cmp = icmp ult <4 x i32> %add6, %mul
+  %cond = select <4 x i1> %cmp, <4 x i32> <i32 65536, i32 65536, i32 65536, i32 65536>, <4 x i32> zeroinitializer
+  %mul8 = mul nuw <4 x i32> %shr, %shr1
+  %add9 = add nuw <4 x i32> %mul8, %cond
+  %shr10 = lshr <4 x i32> %add6, <i32 16, i32 16, i32 16, i32 16>
+  %add11 = add <4 x i32> %add9, %shr10
+  ret <4 x i32> %add11
+}
+
+; Check carry against xlyh, not xhyl
+define i32 @mul_carry_xlyh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_xlyh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul3
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+define i32 @mul_carry_comm(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_comm(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %and2, %shr + %mul3 = mul nuw i32 %shr1, %and + %add = add i32 %mul3, %mul + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %shr5, %add + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %shr10 = lshr i32 %add6, 16 + %add9 = add nuw i32 %cond, %shr10 + %add11 = add i32 %add9, %mul8 + ret i32 %add11 +} + + +; Negative tests + + +define i32 @mul_carry_notxlo(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_notxlo( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 32767 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw nsw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 32767 ; wrong mask + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +define i32 @mul_carry_notyhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_notyhi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 14 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 
[[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 14 ; wrong shift
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+define i32 @mul_carry_notcarry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notcarry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 65536
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 0, i32 65536 ; backwards
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+define i32 @mul_carry_notlolo(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notlolo(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32
%shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +define i32 @mul_carry_nothihi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_nothihi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL4]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul4, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +; Extra uses +define i32 @mul_carry_use_carry(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_carry( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[COND]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %cond) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulhi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL8]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %mul8) + ret i32 %add11 +} + +define i32 @mul_carry_use_llh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_llh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[SHR10]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %shr5) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulll(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulll( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL4]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %mul4) + ret i32 %add11 +} + +define i32 @mul_carry_use_mullh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mullh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[MUL3]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %mul3) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulhl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulhl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %mul) + ret i32 %add11 +} + +define i32 @mul_carry_use_crosssum(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_crosssum( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[ADD9:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD11]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[ADD9]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD10:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[ADD10]], [[SHR11]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[ADD11]]) +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %add) + ret i32 %add11 +} + +define i32 @mul_carry_use_lowaccumhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_lowaccumhi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR11]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %shr10) + ret i32 %add11 +} + +define i32 @mul_carry_use_lowaccum(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_lowaccum( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[ADD6]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %add6) + ret i32 %add11 +} diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll new file mode 100644 index 0000000000000..fa21721f17762 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll @@ -0,0 +1,3019 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; https://alive2.llvm.org/ce/z/KuJPnU +define i64 @umulh(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; Commutative ops should match in any order. Ops where operand order has been +; reversed from above are marked 'commuted'. As per instcombine contributors +; guide, constants are always canonicalized to RHS, so don't bother commuting +; constants. 
+define i64 @umulh__commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+  ; Extract low and high 32 bits
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32 ; x >> 32
+  %y_hi = lshr i64 %y, 32 ; y >> 32
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted
+  %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
+  %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted
+
+  ret i64 %hw64
+}
+
+define i32 @mulh_src32(i32 %x, i32 %y) {
+  ; Extract low and high 16 bits
+; CHECK-LABEL: define i32 @mulh_src32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i64 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+  %x_lo = and i32 %x, u0xffff ; x & 0xffff
+  %y_lo = and i32 %y, u0xffff ; y & 0xffff
+  %x_hi = lshr i32 %x, 16 ; x >> 16
+  %y_hi = lshr i32 %y, 16 ; y >> 16
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i32 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i32 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i32 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i32 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i32 %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i32 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i32 u0x10000, i32 0 ; if overflow, add 1 << 16
+
+  ; High 16 bits of low product
+  %y_lo_x_lo_hi = lshr i32 %y_lo_x_lo, 16
+
+  ; Low and high 16 bits of cross_sum
+  %cross_sum_lo = and i32 %cross_sum, u0xffff
+  %cross_sum_hi = lshr i32 %cross_sum, 16
+
+  %low_accum = add nuw nsw i32 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i32 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i32 %low_accum, 16
+  %intermediate_plus_carry = add i32 %intermediate, %carry
+  %hw64 = add i32 %intermediate_plus_carry, %low_accum_hi
+
+  ret i32 %hw64
+}
+
+define i128 @mulh_src128(i128 %x, i128 %y) {
+  ; Extract low and high 64 bits
+; CHECK-LABEL: define i128 @mulh_src128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
+; CHECK-NEXT: ret i128 [[HW64]]
+;
+  %x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffffffffffff
+  %y_lo = and i128 %y, u0xffffffffffffffff ; y & 0xffffffffffffffff
+  %x_hi = lshr i128 %x, 64 ; x >> 64
+  %y_hi = lshr i128 %y, 64 ; y >> 64
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i128 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i128 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i128 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i128 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i128 %y_hi_x_lo, %y_lo_x_hi ; full 128-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i128 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i128 u0x10000000000000000, i128 0 ; if overflow, add 1 << 64
+
+  ; High 64 bits of low product
+  %y_lo_x_lo_hi = lshr i128 %y_lo_x_lo, 64
+
+  ; Low and high 64 bits of cross_sum
+  %cross_sum_lo = and i128 %cross_sum, u0xffffffffffffffff
+  %cross_sum_hi = lshr i128 %cross_sum, 64
+
+  %low_accum = add nuw nsw i128 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i128 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i128 %low_accum, 64
+  %intermediate_plus_carry = add i128 %intermediate, %carry
+  %hw64 = add i128 %intermediate_plus_carry, %low_accum_hi
+
+  ret i128 %hw64
+}
+
+define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
+  ; Extract low and high 16 bits
+; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[HW64]]
+;
+  %x_lo = and <2 x i32> %x, <i32 65535, i32 65535>
+  %y_lo = and <2 x i32> %y, <i32 65535, i32 65535>
+  %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+  %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw <2 x i32> %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw <2 x i32> %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw <2 x i32> %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw <2 x i32> %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add <2 x i32> %y_hi_x_lo, %y_lo_x_hi
+
+  ; Carry if overflowed
+  %carry_out = icmp ult <2 x i32> %cross_sum, %y_lo_x_hi
+  %carry = select <2 x i1> %carry_out, <2 x i32> <i32 65536, i32 65536>, <2 x i32> <i32 0, i32 0>
+
+  ; High 16 bits of low product
+  %y_lo_x_lo_hi = lshr <2 x i32> %y_lo_x_lo, <i32 16, i32 16>
+
+  ; Low and high 16 bits of cross_sum
+  %cross_sum_lo = and <2 x i32> %cross_sum, <i32 65535, i32 65535>
+  %cross_sum_hi = lshr <2 x i32> %cross_sum, <i32 16, i32 16>
+
+  %low_accum = add nuw nsw <2 x i32> %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw <2 x i32> %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr <2 x i32> %low_accum, <i32 16, i32 16>
+  %intermediate_plus_carry = add <2 x i32> %intermediate, %carry
+  %hw64 = add <2 x i32> %intermediate_plus_carry, %low_accum_hi
+
+  ret <2 x i32> %hw64
+}
+
+; https://alive2.llvm.org/ce/z/PPXtkR
+define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:
[[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + ; Store high 64 bits + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + ; Reconstruct low 64 bits + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + ; Store low 64 bits + store i64 %lw64, ptr %p, align 8 + + ret void +} + + +; Negative tests + +define i64 @umulh_notandx(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notandx( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + 
; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967294 ; x & 0xfffffffe + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notandy(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notandy( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967294 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967294 ; y & 0xfffffffe + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + 
%cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notshiftx(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notshiftx( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 16 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 16 ; x >> 16 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notshifty(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notshifty( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], 
[[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 16 ; y >> 16 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notcarry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notcarry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967295, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = 
add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967295, i64 0 ; if overflow, add wrong value + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notxlo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notxlo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x ; y_lo * x + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits 
of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notcrosssum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notcrosssum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = shl i64 [[Y_HI_X_LO]], 1 +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967294 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_hi_x_lo ; wrong crosssum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + + + +; Uses tests. + +; 'x_lo' can have more than 2 uses. +define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[X_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi' can have more than 2 uses. +define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + call void (...) @llvm.fake.use(i64 %y_hi) + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo * x_hi' must have no more than 2 uses. 
+define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + call void (...) @llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_hi' must have single use. +define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + call void (...) @llvm.fake.use(i64 %y_hi_x_hi) + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_lo' must have single use. +define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI_X_LO]]) +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + call void (...) @llvm.fake.use(i64 %y_hi_x_lo) + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo * x_lo' has a single use if only doing high part of multiply and 2 uses +; when doing both low/high parts. Doing the optimization when only doing the +; high part and there's a 2nd unrelated use here still results in fewer +; instructions and is likely profitable, so this seems ok. +define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + call void (...) @llvm.fake.use(i64 %y_lo_x_lo) + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum' must have no more than 3 uses. +define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM]]) +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + call void (...) @llvm.fake.use(i64 %cross_sum) + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry_out' must have single use. +define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry_out( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i1 [[CARRY_OUT]]) +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) @llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry' must have single use. +define i64 @umulh__mul_use__carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CARRY]]) +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + call void (...) @llvm.fake.use(i64 %carry) + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo_x_lo_hi' must have single use. +define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO_HI]]) +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi) + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_lo' must have single use. +define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_LO]]) +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) @llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_hi' must have single use. +define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_HI]]) +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) @llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum' has a single use if only doing high part of multiply and 2 uses +; when doing both low/high parts. Unrelated use here, but still seems +; profitable. +define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul i64 [[Y]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul i64 [[Y_HI]], [[X]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) @llvm.fake.use(i64 %low_accum) + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate' must have single use. +define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[INTERMEDIATE]]) +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + call void (...) @llvm.fake.use(i64 %intermediate) + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum_hi' must have single use. +define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM_HI]]) +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_hi) + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate_plus_carry' must have single use. +define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[INTERMEDIATE_PLUS_CARRY]]) +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + call void (...) @llvm.fake.use(i64 %intermediate_plus_carry) + + ret i64 %hw64 +} + + +; 'x_lo' can have multiple uses. +define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo' can have multiple uses. 
+define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) @llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'x_hi' can have multiple uses. +define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_HI]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + call void (...) 
@llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi' can have multiple uses. +define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + call void (...) @llvm.fake.use(i64 %y_hi) + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo_x_hi' must have exactly 2 uses. 
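+; (A note on the count, judging by the test body below: the two allowed uses
+; are the cross_sum add and the carry_out compare; the fake.use adds a third,
+; so the CHECK lines show the expansion left unfolded.)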
+define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + call void (...) @llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi_x_hi' must have single use. 
+define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + call void (...) @llvm.fake.use(i64 %y_hi_x_hi) + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi_x_lo' must have single use. 
+define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_lo_x_lo' can have multiple uses.
+; TODO: this does not simplify as much as it should.
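+; (Per the CHECK lines below, the high half still folds to the widened
+; multiply; the extra use of y_lo_x_lo keeps the split parts alive, so the
+; low word is not recombined into a plain 'mul i64 %x, %y'.)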
+define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul i64 [[Y]], [[X_HI]] +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul i64 [[Y_HI]], [[X]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]]) +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM1:%.*]] = shl i64 [[TMP6]], 32 +; CHECK-NEXT: [[LW64:%.*]] = add i64 [[Y_LO_X_LO]], [[LOW_ACCUM1]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + call void (...) @llvm.fake.use(i64 %y_lo_x_lo) + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum' must have no more than 3 uses. +define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM]]) +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + call void (...) @llvm.fake.use(i64 %cross_sum) + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'carry_out' must have single use. +define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i1 [[CARRY_OUT]]) +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) @llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'carry' must have single use. +define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CARRY]]) +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + call void (...) @llvm.fake.use(i64 %carry) + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo_x_lo_hi' must have single use. +define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO_HI]]) +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi) + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum_lo' must have single use. +define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_LO]]) +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) @llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum_hi' must have single use. +define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_HI]]) +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) @llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'low_accum' must have exactly 2 uses if doing high multiply. +define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) @llvm.fake.use(i64 %low_accum) + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'upper_mid' must have single use. +define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[UPPER_MID]]) +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP9]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + call void (...) @llvm.fake.use(i64 %upper_mid) + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'low_accum_hi' must have single use. +define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM_HI]]) +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_hi) + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'upper_mid_with_cross' must have single use. +define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[UPPER_MID_WITH_CROSS]])
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ call void (...) @llvm.fake.use(i64 %upper_mid_with_cross)
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'low_accum_shifted' can have multiple uses.
+define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = and i64 [[LW64]], -4294967296
+; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]]) +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_shifted) + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll new file mode 100644 index 0000000000000..257cc0315c72f --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll @@ -0,0 +1,858 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; https://alive2.llvm.org/ce/z/MSo5S_ +define i64 @umulh_variant(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i32 @umulh_variant_i32(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @umulh_variant_i32( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i64 [[TMP4]] to i32 +; CHECK-NEXT: ret i32 [[HW64]] +; + %x_lo = and i32 %x, u0xffff + %y_lo = and i32 %y, u0xffff + %x_hi = lshr i32 %x, 16 + %y_hi = lshr i32 %y, 16 + + %t0 = mul nuw i32 %y_lo, %x_lo + %t1 = mul nuw i32 %y_lo, %x_hi + %t2 = mul nuw i32 %y_hi, %x_lo + %t3 = mul nuw i32 %y_hi, %x_hi + + %t0_hi = lshr i32 %t0, 16 + + %u0 = add 
nuw i32 %t0_hi, %t1
+ %u0_lo = and i32 %u0, u0xffff
+ %u0_hi = lshr i32 %u0, 16
+ %u1 = add nuw i32 %u0_lo, %t2
+ %u1_hi = lshr i32 %u1, 16
+ %u2 = add nuw i32 %u0_hi, %t3
+ %hw64 = add nuw i32 %u2, %u1_hi
+ ret i32 %hw64
+}
+
+define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: define <2 x i32> @umulh_variant_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[HW64]]
+;
+ %x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
+ %y_lo = and <2 x i32> %y, <i32 u0xffff, i32 u0xffff>
+ %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+ %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+ %t0 = mul nuw <2 x i32> %y_lo, %x_lo
+ %t1 = mul nuw <2 x i32> %y_lo, %x_hi
+ %t2 = mul nuw <2 x i32> %y_hi, %x_lo
+ %t3 = mul nuw <2 x i32> %y_hi, %x_hi
+
+ %t0_hi = lshr <2 x i32> %t0, <i32 16, i32 16>
+
+ %u0 = add nuw <2 x i32> %t0_hi, %t1
+ %u0_lo = and <2 x i32> %u0, <i32 u0xffff, i32 u0xffff>
+ %u0_hi = lshr <2 x i32> %u0, <i32 16, i32 16>
+ %u1 = add nuw <2 x i32> %u0_lo, %t2
+ %u1_hi = lshr <2 x i32> %u1, <i32 16, i32 16>
+ %u2 = add nuw <2 x i32> %u0_hi, %t3
+ %hw64 = add nuw <2 x i32> %u2, %u1_hi
+ ret <2 x i32> %hw64
+}
+
+define i128 @umulh_variant_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @umulh_variant_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
+; CHECK-NEXT: ret i128 [[HW64]]
+;
+ %x_lo = and i128 %x, u0xffffffffffffffff
+ %y_lo = and i128 %y, u0xffffffffffffffff
+ %x_hi = lshr i128 %x, 64
+ %y_hi = lshr i128 %y, 64
+
+ %t0 = mul nuw i128 %y_lo, %x_lo
+ %t1 = mul nuw i128 %y_lo, %x_hi
+ %t2 = mul nuw i128 %y_hi, %x_lo
+ %t3 = mul nuw i128 %y_hi, %x_hi
+
+ %t0_hi = lshr i128 %t0, 64
+
+ %u0 = add nuw i128 %t0_hi, %t1
+ %u0_lo = and i128 %u0, u0xffffffffffffffff
+ %u0_hi = lshr i128 %u0, 64
+ %u1 = add nuw i128 %u0_lo, %t2
+ %u1_hi = lshr i128 %u1, 64
+ %u2 = add nuw i128 %u0_hi, %t3
+ %hw64 = add nuw i128 %u2, %u1_hi
+ ret i128 %hw64
+}
+
+define i64 @umulh_variant_commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %x_lo, %y_lo
+ %t1 = mul nuw i64 %x_lo, %y_hi
+ %t2 = mul nuw i64 %x_hi, %y_lo
+ %t3 = mul nuw i64 %x_hi, %y_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t1, %t0_hi
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %t2, %u0_lo
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u1_hi, %u0_hi
+ %hw64 = add nuw i64 %t3, %u2
+ ret i64 %hw64
+}
+
+
+
+; Negative tests
+
+define i64 @umulh_variant_notlox(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_notlox(
+; CHECK-SAME: i64 
[[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967294 ; wrong imm + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i64 @umulh_variant_nothiy(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_nothiy( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 16 ; wrong imm + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i64 @umulh_variant_notlowacc(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_notlowacc( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] 
= mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967294 ; wrong imm + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i64 @umulh_variant_notll(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_notll( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t3 = mul nuw i64 %y_lo, %x_lo ; swapped lolo and hihi + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t0 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + + + +; Use checks + +; 't0' can have more than one use. +define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[T0]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + call void (...) @llvm.fake.use(i64 %t0) + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't1' can have more than one use. +define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + call void (...) @llvm.fake.use(i64 %t1) + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't2' can have more than one use. +define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T2]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + call void (...) @llvm.fake.use(i64 %t2) + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't3' must have single use. 
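+; If 't3' has an extra use, the fold is expected not to fire: the CHECK lines below keep the full expansion.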
+define i64 @umulh_variant__mul_use__t3(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t3( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T3]]) +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + call void (...) @llvm.fake.use(i64 %t3) + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0_hi' must have single use. +define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0_HI]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + call void (...) @llvm.fake.use(i64 %t0_hi) + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0' must have single use. 
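+; Likewise, an extra use of the intermediate 'u0' blocks the fold.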
+define i64 @umulh_variant__mul_use__u0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0]]) +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + call void (...) @llvm.fake.use(i64 %u0) + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_lo' must have single use. +define i64 @umulh_variant__mul_use__u0_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_LO]]) +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + call void (...) @llvm.fake.use(i64 %u0_lo) + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_hi' must have single use. 
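+; As above: an extra use of 'u0_hi' leaves the expansion untouched.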
+define i64 @umulh_variant__mul_use__u0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_HI]]) +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + call void (...) @llvm.fake.use(i64 %u0_hi) + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1' must have single use. +define i64 @umulh_variant__mul_use__u1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1]]) +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + call void (...) @llvm.fake.use(i64 %u1) + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1_hi' must have single use. 
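+; Same for 'u1_hi'; the transform bails when this intermediate has more than one use.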
+define i64 @umulh_variant__mul_use__u1_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1_HI]]) +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + call void (...) @llvm.fake.use(i64 %u1_hi) + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u2' must have single use. +define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u2( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[U0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U1]], 4294967295 +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U3:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI1:%.*]] = lshr i64 [[U3]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U1_HI]], [[T3]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U2]]) +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI1]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + call void (...) 
@llvm.fake.use(i64 %u2) + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define [2 x i64] @XXH_mult64to128(i64 noundef %lhs, i64 noundef %rhs) { +; CHECK-LABEL: define [2 x i64] @XXH_mult64to128( +; CHECK-SAME: i64 noundef [[LHS:%.*]], i64 noundef [[RHS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[RHS]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[LHS]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[ADD16:%.*]] = trunc nuw i128 [[TMP3]] to i64 +; CHECK-NEXT: [[SHR102:%.*]] = mul i64 [[LHS]], [[RHS]] +; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[SHR102]], 0 +; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[ADD16]], 1 +; CHECK-NEXT: ret [2 x i64] [[DOTFCA_1_INSERT]] +; +entry: + %and = and i64 %lhs, 4294967295 + %and1 = and i64 %rhs, 4294967295 + %mul.i = mul nuw i64 %and1, %and + %shr = lshr i64 %lhs, 32 + %mul.i27 = mul nuw i64 %and1, %shr + %shr5 = lshr i64 %rhs, 32 + %mul.i28 = mul nuw i64 %shr5, %and + %mul.i29 = mul nuw i64 %shr5, %shr + %shr10 = lshr i64 %mul.i, 32 + %and11 = and i64 %mul.i27, 4294967295 + %add = add nuw i64 %and11, %mul.i28 + %add12 = add nuw i64 %add, %shr10 + %shr13 = lshr i64 %mul.i27, 32 + %shr14 = lshr i64 %add12, 32 + %add15 = add nuw i64 %shr13, %mul.i29 + %add16 = add nuw i64 %add15, %shr14 + %shl = shl i64 %add12, 32 + %and17 = and i64 %mul.i, 4294967295 + %or = or disjoint i64 %shl, %and17 + %.fca.0.insert = insertvalue [2 x i64] poison, i64 %or, 0 + %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %add16, 1 + ret [2 x i64] %.fca.1.insert +} + diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll new file mode 100644 index 0000000000000..307fc62a6b4ba --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll @@ -0,0 +1,530 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; Ladder4 variant. 
https://alive2.llvm.org/ce/z/tExFRs +define i32 @mul_ladder4(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = lshr i32 %mulhl, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + +define <2 x i32> @mul_ladder4_v2i32(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: define <2 x i32> @mul_ladder4_v2i32( +; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32) +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw <2 x i64> [[TMP3]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[ADD19]] +; +entry: + %xl = and <2 x i32> %x, <i32 65535, i32 65535> + %xh = lshr <2 x i32> %x, <i32 16, i32 16> + %yl = and <2 x i32> %y, <i32 65535, i32 65535> + %yh = lshr <2 x i32> %y, <i32 16, i32 16> + %mulll = mul nuw <2 x i32> %xl, %yl + %mullh = mul nuw <2 x i32> %xl, %yh + %mulhl = mul nuw <2 x i32> %xh, %yl + %mulhh = mul nuw <2 x i32> %xh, %yh + %shr8 = lshr <2 x i32> %mulll, <i32 16, i32 16> + %conv10 = and <2 x i32> %mullh, <i32 65535, i32 65535> + %add = add nuw nsw <2 x i32> %shr8, %conv10 + %conv12 = and <2 x i32> %mulhl, <i32 65535, i32 65535> + %add13 = add nuw nsw <2 x i32> %add, %conv12 + %shr14 = lshr <2 x i32> %add13, <i32 16, i32 16> + %shr15 = lshr <2 x i32> %mullh, <i32 16, i32 16> + %add16 = add nuw <2 x i32> %mulhh, %shr15 + %shr17 = lshr <2 x i32> %mulhl, <i32 16, i32 16> + %add18 = add nuw <2 x i32> %add16, %shr17 + %add19 = add nuw <2 x i32> %add18, %shr14 + ret <2 x i32> %add19 +} + +define i128 @mul_ladder4_i128(i128 %x, i128 %y) { +; CHECK-LABEL: define i128 @mul_ladder4_i128( +; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256 +; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i256 [[TMP3]] to i128 +; CHECK-NEXT: ret i128 [[ADD19]] +; +entry: + %xl = and i128 %x, u0xffffffffffffffff + %xh = lshr i128 %x, 64 + %yl = and i128 %y, u0xffffffffffffffff + %yh = lshr i128 %y, 64 + %mulll = mul nuw i128 %xl, %yl + %mullh = mul nuw i128 %xl, %yh + %mulhl = mul nuw i128 %xh, %yl + %mulhh = mul nuw i128 %xh, %yh + %shr8 = lshr i128 %mulll, 64 + %conv10 = and i128 %mullh, u0xffffffffffffffff + %add = add nuw nsw i128 %shr8, %conv10 + %conv12 = and i128 %mulhl, u0xffffffffffffffff + %add13 = add nuw nsw i128 %add, %conv12 + %shr14 = lshr i128 %add13, 64 + %shr15 = lshr i128 %mullh, 64 + %add16 = add nuw i128 %mulhh, %shr15 + 
%shr17 = lshr i128 %mulhl, 64 + %add18 = add nuw i128 %add16, %shr17 + %add19 = add nuw i128 %add18, %shr14 + ret i128 %add19 +} + +define i32 @mul_ladder4_commutted(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_commutted( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %shr14, %shr17 + %add18 = add nuw i32 %add16, %shr15 + %add19 = add nuw i32 %mulhh, %add18 + ret i32 %add19 +} + +define i32 @mul_ladder4_swap_hl_lh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_swap_hl_lh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mulhl, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mullh, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = lshr i32 %mullh, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + + +; Negative tests + +define i32 @mul_ladder4_notlhhl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_notlhhl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], 
[[SHR17]] +; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mulhl, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = lshr i32 %mulhl, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + + + + + + +; Extra uses + +define i32 @mul_ladder4_use_add13(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_add13( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD13]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) 
@llvm.fake.use(i32 %add13) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_conv12(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_conv12( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]] +; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[CONV13]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %conv12) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_u0(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_u0( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV13]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL1]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[ADD]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %add) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_hl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_hl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MULHL]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) 
@llvm.fake.use(i32 %mulhl) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_lh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_lh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MULLH]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %mullh) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_conv10(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_conv10( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]] +; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[CONV12]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %conv10) + ret i32 %add19 +}
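+
+; For reference, a C routine along the lines of the (hypothetical)
+; mulh32 below lowers to the ladder4 shape exercised in this file:
+;
+;   uint32_t mulh32(uint32_t x, uint32_t y) {
+;     uint32_t xl = x & 0xffff, xh = x >> 16;
+;     uint32_t yl = y & 0xffff, yh = y >> 16;
+;     uint32_t ll = xl * yl, lh = xl * yh;
+;     uint32_t hl = xh * yl, hh = xh * yh;
+;     // Sum of the three low partial words; its top half is the
+;     // carry into the high word.
+;     uint32_t low = (ll >> 16) + (lh & 0xffff) + (hl & 0xffff);
+;     return hh + (lh >> 16) + (hl >> 16) + (low >> 16);
+;   }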