diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index bbbac45e225a6..279eff0fae0d1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1457,6 +1457,256 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, return false; } +/// Match low part of 128-bit multiplication. +static bool foldMul128Low(Instruction &I) { + auto *Ty = I.getType(); + if (!Ty->isIntegerTy(64)) + return false; + + // (low_accum << 32) | lo(lo(y) * lo(x)) + Value *LowAccum = nullptr, *YLowXLow = nullptr; + if (!match(&I, m_c_DisjointOr( + m_OneUse(m_Shl(m_Value(LowAccum), m_SpecificInt(32))), + m_OneUse( + m_And(m_Value(YLowXLow), m_SpecificInt(0xffffffff)))))) + return false; + + // lo(cross_sum) + hi(lo(y) * lo(x)) + Value *CrossSum = nullptr; + if (!match( + LowAccum, + m_c_Add(m_OneUse(m_And(m_Value(CrossSum), m_SpecificInt(0xffffffff))), + m_OneUse(m_LShr(m_Specific(YLowXLow), m_SpecificInt(32))))) || + LowAccum->hasNUsesOrMore(3)) + return false; + + // (hi(y) * lo(x)) + (lo(y) * hi(x)) + Value *YHigh = nullptr, *XLow = nullptr, *YLowXHigh = nullptr; + if (!match(CrossSum, m_c_Add(m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XLow))), + m_Value(YLowXHigh))) || + CrossSum->hasNUsesOrMore(4)) + return false; + + // lo(y) * lo(x) + Value *YLow = nullptr; + if (!match(YLowXLow, m_c_Mul(m_Value(YLow), m_Specific(XLow))) || + YLowXLow->hasNUsesOrMore(3)) + return false; + + // lo(y) * hi(x) + Value *XHigh = nullptr; + if (!match(YLowXHigh, m_c_Mul(m_Specific(YLow), m_Value(XHigh))) || + !YLowXHigh->hasNUses(2)) + return false; + + Value *X = nullptr; + // lo(x) = x & 0xffffffff + if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) || + !XLow->hasNUses(2)) + return false; + // hi(x) = x >> 32 + if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) || + !XHigh->hasNUses(2)) + return false; + + // Same for Y. + Value *Y = nullptr; + if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) || + !YLow->hasNUses(2)) + return false; + if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) || + !YHigh->hasNUses(2)) + return false; + + IRBuilder<> Builder(&I); + Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty()); + Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty()); + Value *Mul128 = Builder.CreateMul(XExt, YExt); + Value *Res = Builder.CreateTrunc(Mul128, Builder.getInt64Ty()); + I.replaceAllUsesWith(Res); + + return true; +} + +/// Match high part of 128-bit multiplication. 
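+/// A sketch of the shape being matched (modulo commutativity and the
+/// use-count checks), taken from the @umulh test below:
+///
+///   %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+///   %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+///   %carry = select i1 %carry_out, i64 4294967296, i64 0
+///   %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+///   %cross_sum_lo = and i64 %cross_sum, 4294967295
+///   %cross_sum_hi = lshr i64 %cross_sum, 32
+///   %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+///   %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+///   %low_accum_hi = lshr i64 %low_accum, 32
+///   %intermediate_plus_carry = add i64 %intermediate, %carry
+///   %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi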
+static bool foldMul128High(Instruction &I) {
+  auto *Ty = I.getType();
+  if (!Ty->isIntegerTy(64))
+    return false;
+
+  // intermediate_plus_carry + hi(low_accum)
+  Value *IntermediatePlusCarry = nullptr, *LowAccum = nullptr;
+  if (!match(&I,
+             m_c_Add(m_OneUse(m_Value(IntermediatePlusCarry)),
+                     m_OneUse(m_LShr(m_Value(LowAccum), m_SpecificInt(32))))))
+    return false;
+
+  // match:
+  //   (((hi(y) * hi(x)) + carry) + hi(cross_sum))
+  // or:
+  //   ((hi(cross_sum) + (hi(y) * hi(x))) + carry)
+  CmpPredicate Pred;
+  Value *CrossSum = nullptr, *XHigh = nullptr, *YHigh = nullptr,
+        *Carry = nullptr;
+  if (!match(IntermediatePlusCarry,
+             m_c_Add(m_c_Add(m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))),
+                             m_Value(Carry)),
+                     m_OneUse(m_LShr(m_Value(CrossSum), m_SpecificInt(32))))) &&
+      !match(IntermediatePlusCarry,
+             m_c_Add(m_OneUse(m_c_Add(
+                         m_OneUse(m_LShr(m_Value(CrossSum), m_SpecificInt(32))),
+                         m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))))),
+                     m_Value(Carry))))
+    return false;
+
+  // (select (icmp ult cross_sum, (lo(y) * hi(x))), (1 << 32), 0)
+  Value *YLowXHigh = nullptr;
+  if (!match(Carry,
+             m_OneUse(m_Select(m_OneUse(m_ICmp(Pred, m_Specific(CrossSum),
+                                               m_Value(YLowXHigh))),
+                               m_SpecificInt(4294967296), m_SpecificInt(0)))) ||
+      Pred != ICmpInst::ICMP_ULT)
+    return false;
+
+  // (hi(y) * lo(x)) + (lo(y) * hi(x))
+  Value *XLow = nullptr;
+  if (!match(CrossSum,
+             m_c_Add(m_OneUse(m_c_Mul(m_Specific(YHigh), m_Value(XLow))),
+                     m_Specific(YLowXHigh))) ||
+      CrossSum->hasNUsesOrMore(4))
+    return false;
+
+  // lo(y) * hi(x)
+  Value *YLow = nullptr;
+  if (!match(YLowXHigh, m_c_Mul(m_Value(YLow), m_Specific(XHigh))) ||
+      !YLowXHigh->hasNUses(2))
+    return false;
+
+  // lo(cross_sum) + hi(lo(y) * lo(x))
+  Value *YLowXLow = nullptr;
+  if (!match(LowAccum,
+             m_c_Add(m_OneUse(m_c_And(m_Specific(CrossSum),
+                                      m_SpecificInt(0xffffffff))),
+                     m_OneUse(m_LShr(m_Value(YLowXLow), m_SpecificInt(32))))) ||
+      LowAccum->hasNUsesOrMore(3))
+    return false;
+
+  // lo(y) * lo(x)
+  //
+  // When only computing the high part, this product has a single use; when
+  // computing the full multiply, it has 2 uses. Given that the low/high
+  // patterns are matched separately, it's non-trivial to vary the use count
+  // between those cases, but applying the optimization when there's an
+  // unrelated use while only doing the high part still results in fewer
+  // instructions and is likely profitable, so an upper bound of 2 uses
+  // should be fine.
+  if (!match(YLowXLow, m_c_Mul(m_Specific(YLow), m_Specific(XLow))) ||
+      YLowXLow->hasNUsesOrMore(3))
+    return false;
+
+  Value *X = nullptr;
+  // lo(x) = x & 0xffffffff
+  if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) ||
+      !XLow->hasNUses(2))
+    return false;
+  // hi(x) = x >> 32
+  if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) ||
+      !XHigh->hasNUses(2))
+    return false;
+
+  // Same for Y.
+  Value *Y = nullptr;
+  if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) ||
+      !YLow->hasNUses(2))
+    return false;
+  if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) ||
+      !YHigh->hasNUses(2))
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty());
+  Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty());
+  Value *Mul128 = Builder.CreateMul(XExt, YExt);
+  Value *High = Builder.CreateLShr(Mul128, 64);
+  Value *Res = Builder.CreateTrunc(High, Builder.getInt64Ty());
+  I.replaceAllUsesWith(Res);
+
+  return true;
+}
+
+/// Match another variant of high part of 128-bit multiplication.
+///
+/// %t0 = mul nuw i64 %y_lo, %x_lo
+/// %t1 = mul nuw i64 %y_lo, %x_hi
+/// %t2 = mul nuw i64 %y_hi, %x_lo
+/// %t3 = mul nuw i64 %y_hi, %x_hi
+/// %t0_hi = lshr i64 %t0, 32
+/// %u0 = add nuw i64 %t0_hi, %t1
+/// %u0_lo = and i64 %u0, 4294967295
+/// %u0_hi = lshr i64 %u0, 32
+/// %u1 = add nuw i64 %u0_lo, %t2
+/// %u1_hi = lshr i64 %u1, 32
+/// %u2 = add nuw i64 %u0_hi, %t3
+/// %hw64 = add nuw i64 %u2, %u1_hi
+static bool foldMul128HighVariant(Instruction &I) {
+  auto *Ty = I.getType();
+  if (!Ty->isIntegerTy(64))
+    return false;
+
+  // hw64 = (hi(u0) + (hi(y) * hi(x))) + ((lo(u0) + (hi(y) * lo(x))) >> 32)
+  Value *U0 = nullptr, *XHigh = nullptr, *YHigh = nullptr, *XLow = nullptr;
+  if (!match(
+          &I,
+          m_c_Add(m_OneUse(m_c_Add(
+                      m_OneUse(m_LShr(m_Value(U0), m_SpecificInt(32))),
+                      m_OneUse(m_c_Mul(m_Value(YHigh), m_Value(XHigh))))),
+                  m_OneUse(m_LShr(
+                      m_OneUse(m_c_Add(
+                          m_OneUse(m_c_And(m_Deferred(U0),
+                                           m_SpecificInt(0xffffffff))),
+                          m_OneUse(m_c_Mul(m_Deferred(YHigh), m_Value(XLow))))),
+                      m_SpecificInt(32))))))
+    return false;
+
+  // u0 = (hi(lo(y) * lo(x)) + (lo(y) * hi(x)))
+  Value *YLow = nullptr;
+  if (!match(U0,
+             m_c_Add(m_OneUse(m_LShr(
+                         m_OneUse(m_c_Mul(m_Value(YLow), m_Specific(XLow))),
+                         m_SpecificInt(32))),
+                     m_OneUse(m_c_Mul(m_Deferred(YLow), m_Specific(XHigh))))) ||
+      !U0->hasNUses(2))
+    return false;
+
+  Value *X = nullptr;
+  // lo(x) = x & 0xffffffff
+  if (!match(XLow, m_c_And(m_Value(X), m_SpecificInt(0xffffffff))) ||
+      !XLow->hasNUses(2))
+    return false;
+  // hi(x) = x >> 32
+  if (!match(XHigh, m_LShr(m_Specific(X), m_SpecificInt(32))) ||
+      !XHigh->hasNUses(2))
+    return false;
+
+  // Same for Y.
+  Value *Y = nullptr;
+  if (!match(YLow, m_c_And(m_Value(Y), m_SpecificInt(0xffffffff))) ||
+      !YLow->hasNUses(2))
+    return false;
+  if (!match(YHigh, m_LShr(m_Specific(Y), m_SpecificInt(32))) ||
+      !YHigh->hasNUses(2))
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Value *XExt = Builder.CreateZExt(X, Builder.getInt128Ty());
+  Value *YExt = Builder.CreateZExt(Y, Builder.getInt128Ty());
+  Value *Mul128 = Builder.CreateMul(XExt, YExt);
+  Value *High = Builder.CreateLShr(Mul128, 64);
+  Value *Res = Builder.CreateTrunc(High, Builder.getInt64Ty());
+  I.replaceAllUsesWith(Res);
+
+  return true;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -1486,6 +1736,9 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
     MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
     MadeChange |= foldPatternedLoads(I, DL);
     MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
+    MadeChange |= foldMul128Low(I);
+    MadeChange |= foldMul128High(I);
+    MadeChange |= foldMul128HighVariant(I);
     // NOTE: This function introduces erasing of the instruction `I`, so it
     // needs to be called at the end of this sequence, otherwise we may make
     // bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh.ll
new file mode 100644
index 0000000000000..7ffc86c2299ec
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh.ll
@@ -0,0 +1,2571 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+; https://alive2.llvm.org/ce/z/KuJPnU
+define i64 @umulh(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  ; Extract low and high 32 bits
+  %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+  %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+  %x_hi = lshr i64 %x, 32 ; x >> 32
+  %y_hi = lshr i64 %y, 32 ; y >> 32
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; https://alive2.llvm.org/ce/z/MSo5S_
+define i64 @umulh_variant(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %t0 = mul nuw i64 %y_lo, %x_lo
+  %t1 = mul nuw i64 %y_lo, %x_hi
+  %t2 = mul nuw i64 %y_hi, %x_lo
+  %t3 = mul nuw i64 %y_hi, %x_hi
+
+  %t0_hi = lshr i64 %t0, 32
+
+  %u0 = add nuw i64 %t0_hi, %t1
+  %u0_lo = and i64 %u0, 4294967295
+  %u0_hi = lshr i64 %u0, 32
+  %u1 = add nuw i64 %u0_lo, %t2
+  %u1_hi = lshr i64 %u1, 32
+  %u2 = add nuw i64 %u0_hi, %t3
+  %hw64 = add nuw i64 %u2, %u1_hi
+  ret i64 %hw64
+}
+
+; Commutative ops should match in any order. Ops where operand order has been
+; reversed from above are marked 'commuted'. As per the InstCombine
+; contributors guide, constants are always canonicalized to the RHS, so don't
+; bother commuting constants.
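+; For example, the m_c_Mul matcher used by the folds accepts both
+; 'mul i64 %a, %b' and 'mul i64 %b, %a', so each commuted line below is
+; expected to still match.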
+define i64 @umulh__commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted + %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted + + ; Add cross terms + %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 4294967295, %cross_sum ; commuted + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted + + ; Final result accumulation + %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted + %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted + + ret i64 %hw64 +} + +define i64 @umulh_variant_commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %x_lo, %y_lo ; commuted + %t1 = mul nuw i64 %x_hi, %y_lo ; commuted + %t2 = mul nuw i64 %x_lo, %y_hi ; commuted + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t1, %t0_hi ; commuted + %u0_lo = and i64 4294967295, %u0 ; commuted + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %t2, %u0_lo ; commuted + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %t3, %u0_hi ; commuted + %hw64 = add nuw i64 %u1_hi, %u2 ; commuted + ret i64 %hw64 +} + +; https://alive2.llvm.org/ce/z/PPXtkR +define void @full_mul_int128(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP7:%.*]] = mul i128 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc 
i128 [[TMP7]] to i64 +; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + ; Store high 64 bits + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + ; Reconstruct low 64 bits + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + ; Store low 64 bits + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; Negative tests + +; 'x_lo' must have exactly 2 uses. +define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo' must have exactly 2 uses. +define i64 @umulh__mul_use__y_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + call void (...) 
@llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'x_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + call void (...) @llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + call void (...) 
@llvm.fake.use(i64 %y_hi) + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo * x_hi' must have exactly 2 uses. +define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + call void (...) @llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_hi' must have single use. +define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + call void (...) 
@llvm.fake.use(i64 %y_hi_x_hi)
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; 'y_hi * x_lo' must have single use.
+define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo(
+; CHECK-NOT: i128
+  ; Extract low and high 32 bits
+  %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+  %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+  %x_hi = lshr i64 %x, 32 ; x >> 32
+  %y_hi = lshr i64 %y, 32 ; y >> 32
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+  call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %intermediate, %carry
+  %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+  ret i64 %hw64
+}
+
+; 'y_lo_x_lo' has a single use if only doing the high part of the multiply and
+; 2 uses when doing both low/high parts. Doing the optimization when only
+; doing the high part and there's a 2nd unrelated use here still results in
+; fewer instructions and is likely profitable, so this seems ok.
+define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + call void (...) @llvm.fake.use(i64 %y_lo_x_lo) + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum' must have no more than 3 uses. +define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + call void (...) @llvm.fake.use(i64 %cross_sum) + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry_out' must have single use. 
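+; (foldMul128High matches the icmp feeding the carry select under m_OneUse,
+; so this extra use defeats the fold.)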
+define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry_out( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) @llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry' must have single use. +define i64 @umulh__mul_use__carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + call void (...) @llvm.fake.use(i64 %carry) + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo_x_lo_hi' must have single use. 
+define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi) + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_lo' must have single use. +define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) @llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_hi' must have single use. 
+define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) @llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum' has a single use if only doing high part of multiply and 2 uses +; when doing both low/high parts. Unrelated use here, but still seems +; profitable. +define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) @llvm.fake.use(i64 %low_accum) + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate' must have single use. +define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + call void (...) @llvm.fake.use(i64 %intermediate) + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum_hi' must have single use. 
+define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_hi) + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate_plus_carry' must have single use. +define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry( +; CHECK-NOT: i128 + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + call void (...) @llvm.fake.use(i64 %intermediate_plus_carry) + + ret i64 %hw64 +} + +; 'x_lo' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__x_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + call void (...) 
@llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'y_lo' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__y_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__y_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) @llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'x_hi' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__x_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + call void (...) @llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'y_hi' must have exactly 2 uses. +define i64 @umulh_variant__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__y_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + call void (...) @llvm.fake.use(i64 %y_hi) + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0' must have single use. +define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + call void (...) @llvm.fake.use(i64 %t0) + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't1' must have single use. 
+define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + call void (...) @llvm.fake.use(i64 %t1) + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't2' must have single use. +define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + call void (...) @llvm.fake.use(i64 %t2) + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't3' must have single use. +define i64 @umulh_variant__mul_use__t3(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t3( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + call void (...) @llvm.fake.use(i64 %t3) + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0_hi' must have single use. +define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + call void (...) @llvm.fake.use(i64 %t0_hi) + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0' must have single use. +define i64 @umulh_variant__mul_use__u0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + call void (...) 
@llvm.fake.use(i64 %u0) + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_lo' must have single use. +define i64 @umulh_variant__mul_use__u0_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + call void (...) @llvm.fake.use(i64 %u0_lo) + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_hi' must have single use. +define i64 @umulh_variant__mul_use__u0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + call void (...) @llvm.fake.use(i64 %u0_hi) + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1' must have single use. +define i64 @umulh_variant__mul_use__u1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + call void (...) @llvm.fake.use(i64 %u1) + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1_hi' must have single use. +define i64 @umulh_variant__mul_use__u1_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1_hi( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + call void (...) @llvm.fake.use(i64 %u1_hi) + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u2' must have single use. 
+define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u2( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + call void (...) @llvm.fake.use(i64 %u2) + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'x_lo' must have exactly 2 uses. +define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo' must have exactly 2 uses. +define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo( +; CHECK-NOT: i128 + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) @llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'x_hi' must have exactly 2 uses. 
+define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  call void (...) @llvm.fake.use(i64 %x_hi)
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi' must have exactly 2 uses.
+define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+  call void (...) @llvm.fake.use(i64 %y_hi)
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_hi' must have exactly 2 uses.
+define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi_x_hi' must have single use. High pattern doesn't apply, but the low
+; pattern does, so there's still i128.
+define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_hi_x_lo' must have single use.
+define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo' must have exactly 2 uses for full multiply.
+define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum' must have no more than 3 uses.
+define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+  call void (...) @llvm.fake.use(i64 %cross_sum)
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'carry_out' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  call void (...) @llvm.fake.use(i1 %carry_out)
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'carry' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CARRY]])
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+  call void (...) @llvm.fake.use(i64 %carry)
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo_hi' must have single use.
+define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum_lo' must have single use.
+define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  call void (...) @llvm.fake.use(i64 %cross_sum_lo)
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'cross_sum_hi' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+  call void (...) @llvm.fake.use(i64 %cross_sum_hi)
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum' must have exactly 2 uses if doing high multiply.
+define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum(
+; CHECK-NOT: i128
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+  call void (...) @llvm.fake.use(i64 %low_accum)
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; An extra use of 'upper_mid' doesn't defeat the fold: 'upper_mid' itself is
+; never use-checked, so both the high and low parts are still folded to i128.
+define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[UPPER_MID]])
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i128 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i64
+; CHECK-NEXT:    store i64 [[TMP9]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  call void (...) @llvm.fake.use(i64 %upper_mid)
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum_hi' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  call void (...) @llvm.fake.use(i64 %low_accum_hi)
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'upper_mid_with_cross' must have single use.
+; High pattern doesn't apply, but low pattern does, so there's i128.
+define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT:    [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[UPPER_MID_WITH_CROSS]])
+; CHECK-NEXT:    [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  call void (...) @llvm.fake.use(i64 %upper_mid_with_cross)
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'low_accum_shifted' must have single use.
+; Low pattern doesn't apply, but high pattern does, so there's i128.
+define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
+; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  call void (...) @llvm.fake.use(i64 %low_accum_shifted)
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}
+
+; 'y_lo_x_lo_lo' must have single use.
+; Low pattern doesn't apply, but high pattern does, so there's i128.
+define void @full_mul_int128__mul_use__y_lo_x_lo_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT:    [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT:    [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT:    [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT:    [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT:    [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT:    [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT:    [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_LO]])
+; CHECK-NEXT:    [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT:    store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  %x_lo = and i64 %x, 4294967295
+  %y_lo = and i64 %y, 4294967295
+  %x_hi = lshr i64 %x, 32
+  %y_hi = lshr i64 %y, 32
+
+  %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+  %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+  %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+  %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+  %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+  %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+  %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+  %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+  store i64 %hw64, ptr %hi_ptr, align 8
+
+  %low_accum_shifted = shl i64 %low_accum, 32
+  %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+  call void (...) @llvm.fake.use(i64 %y_lo_x_lo_lo)
+  %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+  store i64 %lw64, ptr %p, align 8
+
+  ret void
+}