diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index bbbac45e225a6..279eff0fae0d1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1457,6 +1457,256 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, return false; } +/// Match low part of 128-bit multiplication. +static bool foldMul128Low(Instruction &I) { + auto *Ty = I.getType(); + if (!Ty->isIntegerTy(64)) + return false; + + // (low_accum << 32) | lo(lo(y) * lo(x)) + Value *LowAccum = nullptr, *YLowXLow = nullptr; + if (!match(&I, m_c_DisjointOr( + m_OneUse(m_Shl(m_Value(LowAccum), m_SpecificInt(32))), + m_OneUse( + m_And(m_Value(YLowXLow), m_SpecificInt(0xffffffff)))))) + return false; + + // lo(cross_sum) + hi(lo(y) * lo(x)) + Value *CrossSum = nullptr; + if (!match( + LowAccum, + m_c_Add(m_OneUse(m_And(m_Value(CrossSum), m_SpecificInt(0xffffffff))), + m_OneUse(m_LShr(m_Specific(YLowXLow), m_SpecificInt(32))))) || + LowAccum->hasNUsesOrMore(3)) + return false; + + // (hi(y) * lo(x)) + (lo(y) * hi(x)) + Value *YHigh = nullptr, *XLow = nullptr, *YLowXHigh = nullptr; + if (!match(CrossSum, m_c_Add(m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XLow))), + m_Value(YLowXHigh))) || + CrossSum->hasNUsesOrMore(4)) + return false; + + // lo(y) * lo(x) + Value *YLow = nullptr; + if (!match(YLowXLow, m_c_Mul(m_Value(YLow), m_Specific(XLow))) || + YLowXLow->hasNUsesOrMore(3)) + return false; + + // lo(y) * hi(x) + Value *XHigh = nullptr; + if (!match(YLowXHigh, m_c_Mul(m_Specific(YLow), m_Value(XHigh))) || + !YLowXHigh->hasNUses(2)) + return false; + + Value *X = nullptr; + // lo(x) = x & 0xffffffff + if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) || + !XLow->hasNUses(2)) + return false; + // hi(x) = x >> 32 + if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) || + !XHigh->hasNUses(2)) + return false; + + // Same for Y. + Value *Y = nullptr; + if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) || + !YLow->hasNUses(2)) + return false; + if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) || + !YHigh->hasNUses(2)) + return false; + + IRBuilder<> Builder(&I); + Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty()); + Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty()); + Value *Mul128 = Builder.CreateMul(XExt, YExt); + Value *Res = Builder.CreateTrunc(Mul128, Builder.getInt64Ty()); + I.replaceAllUsesWith(Res); + + return true; +} + +/// Match high part of 128-bit multiplication. 
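+/// A sketch of the shape being matched (modulo commutativity and the
+/// use-count checks), taken from the @umulh test below:
+///
+///   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+///   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+///   %carry = select i1 %carry_out, i64 4294967296, i64 0
+///   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+///   %cross_sum_lo = and i64 %cross_sum, 4294967295
+///   %cross_sum_hi = lshr i64 %cross_sum, 32
+///   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+///   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+///   %low_accum_hi = lshr i64 %low_accum, 32
+///   %intermediate_plus_carry = add i64 %intermediate, %carry
+///   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi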
+static bool foldMul128High(Instruction &I) {
+  auto *Ty = I.getType();
+  if (!Ty->isIntegerTy(64))
+    return false;
+
+  // intermediate_plus_carry + hi(low_accum)
+  Value *IntermediatePlusCarry = nullptr, *LowAccum = nullptr;
+  if (!match(&I,
+             m_c_Add(m_OneUse(m_Value(IntermediatePlusCarry)),
+                     m_OneUse(m_LShr(m_Value(LowAccum), m_SpecificInt(32))))))
+    return false;
+
+  // match:
+  //   (((hi(y) * hi(x)) + carry) + hi(cross_sum))
+  // or:
+  //   ((hi(cross_sum) + (hi(y) * hi(x))) + carry)
+  CmpPredicate Pred;
+  Value *CrossSum = nullptr, *XHigh = nullptr, *YHigh = nullptr,
+        *Carry = nullptr;
+  if (!match(IntermediatePlusCarry,
+             m_c_Add(m_c_Add(m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))),
+                             m_Value(Carry)),
+                     m_OneUse(m_LShr(m_Value(CrossSum), m_SpecificInt(32))))) &&
+      !match(IntermediatePlusCarry,
+             m_c_Add(m_OneUse(m_c_Add(
+                         m_OneUse(m_LShr(m_Value(CrossSum), m_SpecificInt(32))),
+                         m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))))),
+                     m_Value(Carry))))
+    return false;
+
+  // (select (icmp ult cross_sum, (lo(y) * hi(x))), (1 << 32), 0)
+  Value *YLowXHigh = nullptr;
+  if (!match(Carry,
+             m_OneUse(m_Select(m_OneUse(m_ICmp(Pred, m_Specific(CrossSum),
+                                               m_Value(YLowXHigh))),
+                               m_SpecificInt(4294967296), m_SpecificInt(0)))) ||
+      Pred != ICmpInst::ICMP_ULT)
+    return false;
+
+  // (hi(y) * lo(x)) + (lo(y) * hi(x))
+  Value *XLow = nullptr;
+  if (!match(CrossSum,
+             m_c_Add(m_OneUse(m_c_Mul(m_Specific(YHigh), m_Value(XLow))),
+                     m_Specific(YLowXHigh))) ||
+      CrossSum->hasNUsesOrMore(4))
+    return false;
+
+  // lo(y) * hi(x)
+  Value *YLow = nullptr;
+  if (!match(YLowXHigh, m_c_Mul(m_Value(YLow), m_Specific(XHigh))) ||
+      !YLowXHigh->hasNUses(2))
+    return false;
+
+  // lo(cross_sum) + hi(lo(y) * lo(x))
+  Value *YLowXLow = nullptr;
+  if (!match(LowAccum,
+             m_c_Add(m_OneUse(m_c_And(m_Specific(CrossSum),
+                                      m_SpecificInt(0xffffffff))),
+                     m_OneUse(m_LShr(m_Value(YLowXLow), m_SpecificInt(32))))) ||
+      LowAccum->hasNUsesOrMore(3))
+    return false;
+
+  // lo(y) * lo(x)
+  //
+  // When only computing the high part, this product has a single use; when
+  // computing the full multiply, it has 2 uses. Given that the low/high
+  // patterns are matched separately, it's non-trivial to vary the use count
+  // between those cases, but applying the optimization when there's an
+  // unrelated use while only doing the high part still results in fewer
+  // instructions and is likely profitable, so an upper bound of 2 uses
+  // should be fine.
+  if (!match(YLowXLow, m_c_Mul(m_Specific(YLow), m_Specific(XLow))) ||
+      YLowXLow->hasNUsesOrMore(3))
+    return false;
+
+  Value *X = nullptr;
+  // lo(x) = x & 0xffffffff
+  if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) ||
+      !XLow->hasNUses(2))
+    return false;
+  // hi(x) = x >> 32
+  if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) ||
+      !XHigh->hasNUses(2))
+    return false;
+
+  // Same for Y.
+  Value *Y = nullptr;
+  if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) ||
+      !YLow->hasNUses(2))
+    return false;
+  if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) ||
+      !YHigh->hasNUses(2))
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty());
+  Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty());
+  Value *Mul128 = Builder.CreateMul(XExt, YExt);
+  Value *High = Builder.CreateLShr(Mul128, 64);
+  Value *Res = Builder.CreateTrunc(High, Builder.getInt64Ty());
+  I.replaceAllUsesWith(Res);
+
+  return true;
+}
+
+/// Match another variant of high part of 128-bit multiplication.
+///
+/// %t0 = mul nuw i64 %y_lo, %x_lo
+/// %t1 = mul nuw i64 %y_lo, %x_hi
+/// %t2 = mul nuw i64 %y_hi, %x_lo
+/// %t3 = mul nuw i64 %y_hi, %x_hi
+/// %t0_hi = lshr i64 %t0, 32
+/// %u0 = add nuw i64 %t0_hi, %t1
+/// %u0_lo = and i64 %u0, 4294967295
+/// %u0_hi = lshr i64 %u0, 32
+/// %u1 = add nuw i64 %u0_lo, %t2
+/// %u1_hi = lshr i64 %u1, 32
+/// %u2 = add nuw i64 %u0_hi, %t3
+/// %hw64 = add nuw i64 %u2, %u1_hi
+static bool foldMul128HighVariant(Instruction &I) {
+  auto *Ty = I.getType();
+  if (!Ty->isIntegerTy(64))
+    return false;
+
+  // hw64 = (hi(u0) + (hi(y) * hi(x))) + ((lo(u0) + (hi(y) * lo(x))) >> 32)
+  Value *U0 = nullptr, *XHigh = nullptr, *YHigh = nullptr, *XLow = nullptr;
+  if (!match(
+          &I,
+          m_c_Add(m_OneUse(m_c_Add(
+                      m_OneUse(m_LShr(m_Value(U0), m_SpecificInt(32))),
+                      m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))))),
+                  m_OneUse(m_LShr(
+                      m_OneUse(m_c_Add(
+                          m_OneUse(m_c_And(m_Deferred(U0),
+                                           m_SpecificInt(0xffffffff))),
+                          m_OneUse(m_c_Mul(m_Deferred(YHigh), m_Value(XLow))))),
+                      m_SpecificInt(32))))))
+    return false;
+
+  // u0 = (hi(lo(y) * lo(x)) + (lo(y) * hi(x)))
+  Value *YLow = nullptr;
+  if (!match(U0,
+             m_c_Add(m_OneUse(m_LShr(
+                         m_OneUse(m_c_Mul(m_Value(YLow), m_Specific(XLow))),
+                         m_SpecificInt(32))),
+                     m_OneUse(m_c_Mul(m_Deferred(YLow), m_Specific(XHigh))))) ||
+      !U0->hasNUses(2))
+    return false;
+
+  Value *X = nullptr;
+  // lo(x) = x & 0xffffffff
+  if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) ||
+      !XLow->hasNUses(2))
+    return false;
+  // hi(x) = x >> 32
+  if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) ||
+      !XHigh->hasNUses(2))
+    return false;
+
+  // Same for Y.
+  Value *Y = nullptr;
+  if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) ||
+      !YLow->hasNUses(2))
+    return false;
+  if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) ||
+      !YHigh->hasNUses(2))
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty());
+  Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty());
+  Value *Mul128 = Builder.CreateMul(XExt, YExt);
+  Value *High = Builder.CreateLShr(Mul128, 64);
+  Value *Res = Builder.CreateTrunc(High, Builder.getInt64Ty());
+  I.replaceAllUsesWith(Res);
+
+  return true;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -1486,6 +1736,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
     MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
     MadeChange |= foldPatternedLoads(I, DL);
     MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
+    MadeChange |= foldMul128Low(I);
+    MadeChange |= foldMul128High(I);
+    MadeChange |= foldMul128HighVariant(I);
     // NOTE: This function introduces erasing of the instruction `I`, so it
     // needs to be called at the end of this sequence, otherwise we may make
     // bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh.ll
new file mode 100644
index 0000000000000..7ffc86c2299ec
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh.ll
@@ -0,0 +1,2571 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+; https://alive2.llvm.org/ce/z/KuJPnU
+define i64 @umulh(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  ; Extract low and high 32 bits
+  %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+  %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+  %x_hi = lshr i64 %x, 32 ; x >> 32
+  %y_hi = lshr i64 %y, 32 ; y >> 32
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; https://alive2.llvm.org/ce/z/MSo5S_
+define i64 @umulh_variant(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %t0 = mul nuw i64 %y_lo, %x_lo
+  %t1 = mul nuw i64 %y_lo, %x_hi
+  %t2 = mul nuw i64 %y_hi, %x_lo
+  %t3 = mul nuw i64 %y_hi, %x_hi
+
+  %t0_hi = lshr i64 %t0, 32
+
+  %u0 = add nuw i64 %t0_hi, %t1
+  %u0_lo = and i64 %u0, 4294967295
+  %u0_hi = lshr i64 %u0, 32
+  %u1 = add nuw i64 %u0_lo, %t2
+  %u1_hi = lshr i64 %u1, 32
+  %u2 = add nuw i64 %u0_hi, %t3
+  %hw64 = add nuw i64 %u2, %u1_hi
+  ret i64 %hw64
+}
+
+; Commutative ops should match in any order. Ops where operand order has been
+; reversed from above are marked 'commuted'. As per the InstCombine
+; contributors guide, constants are always canonicalized to the RHS, so don't
+; bother commuting constants.
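+; For example, the m_c_Mul matcher used by the folds accepts both
+; 'mul i64 %a, %b' and 'mul i64 %b, %a', so each commuted line below is
+; expected to still match.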
+define i64 @umulh__commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted + %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted + + ; Add cross terms + %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 4294967295, %cross_sum ; commuted + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted + + ; Final result accumulation + %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted + %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted + + ret i64 %hw64 +} + +define i64 @umulh_variant_commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %x_lo, %y_lo ; commuted + %t1 = mul nuw i64 %x_hi, %y_lo ; commuted + %t2 = mul nuw i64 %x_lo, %y_hi ; commuted + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t1, %t0_hi ; commuted + %u0_lo = and i64 4294967295, %u0 ; commuted + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %t2, %u0_lo ; commuted + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %t3, %u0_hi ; commuted + %hw64 = add nuw i64 %u1_hi, %u2 ; commuted + ret i64 %hw64 +} + +; https://alive2.llvm.org/ce/z/PPXtkR +define void @full_mul_int128(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP7:%.*]] = mul i128 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc 
i128 [[TMP7]] to i64 +; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + ; Store high 64 bits + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + ; Reconstruct low 64 bits + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + ; Store low 64 bits + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; Negative tests + +; 'x_lo' must have exactly 2 uses. +define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo' must have exactly 2 uses. +define i64 @umulh__mul_use__y_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + call void (...) 
@llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'x_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + call void (...) @llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + call void (...) 
@llvm.fake.use(i64 %y_hi) + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo * x_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + call void (...) @llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_hi' must have single use. +define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + call void (...) 
@llvm.fake.use(i64 %y_hi_x_hi)
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; 'y_hi * x_lo' must have single use.
+define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo(
+; CHECK-NOT: i128
+  ; Extract low and high 32 bits
+  %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+  %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+  %x_hi = lshr i64 %x, 32 ; x >> 32
+  %y_hi = lshr i64 %y, 32 ; y >> 32
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; 'y_lo_x_lo' has a single use if only doing the high part of the multiply and
+; 2 uses when doing both low/high parts. Doing the optimization when only
+; doing the high part and there's a 2nd unrelated use here still results in
+; fewer instructions and is likely profitable, so this seems ok.
+define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + call void (...) @llvm.fake.use(i64 %y_lo_x_lo) + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum' must have no more than 3 uses. +define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + call void (...) @llvm.fake.use(i64 %cross_sum) + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry_out' must have single use. 
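+; (foldMul128High matches the icmp feeding the carry select under m_OneUse,
+; so this extra use defeats the fold.)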
+define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry_out( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) @llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry' must have single use. +define i64 @umulh__mul_use__carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + call void (...) @llvm.fake.use(i64 %carry) + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo_x_lo_hi' must have single use. 
+define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi) + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_lo' must have single use. +define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) @llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_hi' must have single use. 
+define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) @llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum' has a single use if only doing high part of multiply and 2 uses +; when doing both low/high parts. Unrelated use here, but still seems +; profitable. +define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) @llvm.fake.use(i64 %low_accum) + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate' must have single use. +define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + call void (...) @llvm.fake.use(i64 %intermediate) + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum_hi' must have single use. 
+define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_hi) + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate_plus_carry' must have single use. +define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + call void (...) @llvm.fake.use(i64 %intermediate_plus_carry) + + ret i64 %hw64 +} + +; 'x_lo' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__x_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + call void (...) 
@llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'y_lo' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__y_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__y_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) @llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'x_hi' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__x_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + call void (...) @llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'y_hi' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__y_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + call void (...) @llvm.fake.use(i64 %y_hi) + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0' must have single use. +define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + call void (...) @llvm.fake.use(i64 %t0) + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't1' must have single use. 
+define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + call void (...) @llvm.fake.use(i64 %t1) + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't2' must have single use. +define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + call void (...) @llvm.fake.use(i64 %t2) + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't3' must have single use. +define i64 @umulh_variant__mul_use__t3(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t3( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + call void (...) @llvm.fake.use(i64 %t3) + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0_hi' must have single use. +define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + call void (...) @llvm.fake.use(i64 %t0_hi) + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0' must have single use. +define i64 @umulh_variant__mul_use__u0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + call void (...) 
@llvm.fake.use(i64 %u0) + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_lo' must have single use. +define i64 @umulh_variant__mul_use__u0_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + call void (...) @llvm.fake.use(i64 %u0_lo) + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_hi' must have single use. +define i64 @umulh_variant__mul_use__u0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + call void (...) @llvm.fake.use(i64 %u0_hi) + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1' must have single use. +define i64 @umulh_variant__mul_use__u1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + call void (...) @llvm.fake.use(i64 %u1) + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1_hi' must have single use. +define i64 @umulh_variant__mul_use__u1_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + call void (...) @llvm.fake.use(i64 %u1_hi) + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u2' must have single use. 
+define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u2( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + call void (...) @llvm.fake.use(i64 %u2) + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'x_lo' must have exactly 2 uses. +define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo' must have exactly 2 uses. +define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) @llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'x_hi' must have exactly 2 uses. 
+define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  call void (...) @llvm.fake.use(i64 %x_hi)
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi' must have exactly 2 uses.
+define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+  call void (...) @llvm.fake.use(i64 %y_hi)
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_hi' must have exactly 2 uses.
+define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi_x_hi' must have single use. High pattern doesn't apply, but the low
+; pattern does, so there's still i128.
+define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi_x_lo' must have single use.
+define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo' must have exactly 2 uses for full multiply.
+define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum' must have no more than 3 uses.
+define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+  call void (...) @llvm.fake.use(i64 %cross_sum)
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'carry_out' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  call void (...) @llvm.fake.use(i1 %carry_out)
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'carry' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CARRY]])
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+  call void (...) @llvm.fake.use(i64 %carry)
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo_hi' must have single use.
+define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum_lo' must have single use.
+define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  call void (...) @llvm.fake.use(i64 %cross_sum_lo)
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum_hi' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+  call void (...) @llvm.fake.use(i64 %cross_sum_hi)
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum' must have exactly 2 uses if doing high multiply.
+define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+  call void (...) @llvm.fake.use(i64 %low_accum)
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; An extra use of 'upper_mid' doesn't defeat the fold: 'upper_mid' itself is
+; never use-checked, so both the high and low parts are still folded to i128.
+define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[UPPER_MID]])
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i128 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i64
+; CHECK-NEXT:    store i64 [[TMP9]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  call void (...) @llvm.fake.use(i64 %upper_mid)
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum_hi' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  call void (...) @llvm.fake.use(i64 %low_accum_hi)
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'upper_mid_with_cross' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[UPPER_MID_WITH_CROSS]])
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  call void (...) @llvm.fake.use(i64 %upper_mid_with_cross)
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum_shifted' must have single use.
+; Low pattern doesn't apply, but high pattern does, so there's i128.
+define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
+; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  call void (...) @llvm.fake.use(i64 %low_accum_shifted)
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo_lo' must have single use.
+; Low pattern doesn't apply, but high pattern does, so there's i128.
+define void @full_mul_int128__mul_use__y_lo_x_lo_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_LO]])
+; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo_lo)
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}