From c33731ab3f1dc80410f4c4cd3452e12200c06029 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 17 Nov 2025 11:29:29 +0000
Subject: [PATCH 1/5] [AggressiveInstCombine] Add various tests for
 high-multiply

---
 .../AggressiveInstCombine/umulh_carry.ll   |  825 +++++
 .../AggressiveInstCombine/umulh_carry4.ll  | 3227 +++++++++++++++++
 .../AggressiveInstCombine/umulh_ladder.ll  |  904 +++++
 .../AggressiveInstCombine/umulh_ladder4.ll |  600 +++
 4 files changed, 5556 insertions(+)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll

diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
new file mode 100644
index 0000000000000..b9801370028cc
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
@@ -0,0 +1,825 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i32 @mul_carry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
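+; A minimal C model of the pattern above (illustrative sketch, not part of
+; the tested IR; the name mul_carry_ref is made up). The carry variant folds
+; the high half of the low product into the cross sum, then repairs a
+; possible unsigned wrap with a +0x10000 term:
+;
+;   uint32_t mul_carry_ref(uint32_t x, uint32_t y) {
+;     uint32_t xh = x >> 16, xl = x & 0xffff;
+;     uint32_t yh = y >> 16, yl = y & 0xffff;
+;     uint32_t sum = xh * yl + xl * yh + ((xl * yl) >> 16); // wraps at most once
+;     uint32_t carry = sum < xh * yl ? 0x10000 : 0;         // wrap detected
+;     return xh * yh + carry + (sum >> 16);
+;   }
+;
+; For all x and y this equals (uint32_t)(((uint64_t)x * y) >> 32), i.e. a
+; plain umulh, which is the form these sequences should eventually fold to.
+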
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i128 @mul_carry_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @mul_carry_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[AND:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[AND2:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i128 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i128 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i128 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i128 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i128 [[MUL4]], 64
+; CHECK-NEXT: [[ADD6:%.*]] = add i128 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i128 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i128 18446744073709551616, i128 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i128 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i128 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i128 [[ADD6]], 64
+; CHECK-NEXT: [[ADD11:%.*]] = add i128 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i128 [[ADD11]]
+;
+entry:
+  %shr = lshr i128 %x, 64
+  %and = and i128 %x, u0xffffffffffffffff
+  %shr1 = lshr i128 %y, 64
+  %and2 = and i128 %y, u0xffffffffffffffff
+  %mul = mul nuw i128 %shr, %and2
+  %mul3 = mul nuw i128 %and, %shr1
+  %add = add i128 %mul, %mul3
+  %mul4 = mul nuw i128 %and, %and2
+  %shr5 = lshr i128 %mul4, 64
+  %add6 = add i128 %add, %shr5
+  %cmp = icmp ult i128 %add6, %mul
+  %cond = select i1 %cmp, i128 u0x10000000000000000, i128 0
+  %mul8 = mul nuw i128 %shr, %shr1
+  %add9 = add nuw i128 %mul8, %cond
+  %shr10 = lshr i128 %add6, 64
+  %add11 = add i128 %add9, %shr10
+  ret i128 %add11
+}
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define <4 x i32> @mul_carry_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i32> @mul_carry_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[SHR1:%.*]] = lshr <4 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[AND2:%.*]] = and <4 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw <4 x i32> [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw <4 x i32> [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr <4 x i32> [[MUL4]], splat (i32 16)
+; CHECK-NEXT: [[ADD6:%.*]] = add <4 x i32> [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> splat (i32 65536), <4 x i32> zeroinitializer
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw <4 x i32> [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw <4 x i32> [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr <4 x i32> [[ADD6]], splat (i32 16)
+; CHECK-NEXT: [[ADD11:%.*]] = add <4 x i32> [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret <4 x i32> [[ADD11]]
+;
+entry:
+  %shr = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+  %and = and <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %shr1 = lshr <4 x i32> %y, <i32 16, i32 16, i32 16, i32 16>
+  %and2 = and <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+  %mul = mul nuw <4 x i32> %shr, %and2
+  %mul3 = mul nuw <4 x i32> %and, %shr1
+  %add = add <4 x i32> %mul, %mul3
+  %mul4 = mul nuw <4 x i32> %and, %and2
+  %shr5 = lshr <4 x i32> %mul4, <i32 16, i32 16, i32 16, i32 16>
+  %add6 = add <4 x i32> %add, %shr5
+  %cmp = icmp ult <4 x i32> %add6, %mul
+  %cond = select <4 x i1> %cmp, <4 x i32> <i32 65536, i32 65536, i32 65536, i32 65536>, <4 x i32> zeroinitializer
+  %mul8 = mul nuw <4 x i32> %shr, %shr1
+  %add9 = add nuw <4 x i32> %mul8, %cond
+  %shr10 = lshr <4 x i32> %add6, <i32 16, i32 16, i32 16, i32 16>
+  %add11 = add <4 x i32> %add9, %shr10
+  ret <4 x i32> %add11
+}
+
+; Check carry against xlyh, not xhyl
+define i32 @mul_carry_xlyh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_xlyh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL3]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul3
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul 
nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +define i32 @mul_carry_comm(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_comm( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[AND2]], [[SHR]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[SHR1]], [[AND]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL3]], [[MUL]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[SHR5]], [[ADD]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[COND]], [[SHR10]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[MUL8]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %and2, %shr + %mul3 = mul nuw i32 %shr1, %and + %add = add i32 %mul3, %mul + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %shr5, %add + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %shr10 = lshr i32 %add6, 16 + %add9 = add nuw i32 %cond, %shr10 + %add11 = add i32 %add9, %mul8 + ret i32 %add11 +} + + +; Negative tests + + +define i32 @mul_carry_notxlo(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_notxlo( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 32767 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw nsw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 32767 ; wrong mask + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +define i32 @mul_carry_notyhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 
@mul_carry_notyhi(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 14
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 14 ; wrong shift
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 65536, i32 0
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+define i32 @mul_carry_notcarry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notcarry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 65536
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+  %shr = lshr i32 %x, 16
+  %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 16
+  %and2 = and i32 %y, 65535
+  %mul = mul nuw i32 %shr, %and2
+  %mul3 = mul nuw i32 %and, %shr1
+  %add = add i32 %mul, %mul3
+  %mul4 = mul nuw i32 %and, %and2
+  %shr5 = lshr i32 %mul4, 16
+  %add6 = add i32 %add, %shr5
+  %cmp = icmp ult i32 %add6, %mul
+  %cond = select i1 %cmp, i32 0, i32 65536 ; backwards
+  %mul8 = mul nuw i32 %shr, %shr1
+  %add9 = add nuw i32 %mul8, %cond
+  %shr10 = lshr i32 %add6, 16
+  %add11 = add i32 %add9, %shr10
+  ret i32 %add11
+}
+
+define i32 @mul_carry_notlolo(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notlolo(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; 
CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +define i32 @mul_carry_nothihi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_nothihi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL4]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul4, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + ret i32 %add11 +} + +; Extra uses +define i32 @mul_carry_use_carry(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_carry( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 
[[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[COND]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %cond) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulhi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL8]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) 
@llvm.fake.use(i32 %mul8) + ret i32 %add11 +} + +define i32 @mul_carry_use_llh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_llh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR10]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %shr5) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulll(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulll( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL4]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) 
@llvm.fake.use(i32 %mul4) + ret i32 %add11 +} + +define i32 @mul_carry_use_mullh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mullh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL3]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %mul3) + ret i32 %add11 +} + +define i32 @mul_carry_use_mulhl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_mulhl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) 
@llvm.fake.use(i32 %mul) + ret i32 %add11 +} + +define i32 @mul_carry_use_crosssum(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_crosssum( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[ADD9:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD11]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[ADD9]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD10:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[ADD10]], [[SHR11]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD11]]) +; CHECK-NEXT: ret i32 [[TMP4]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %add) + ret i32 %add11 +} + +define i32 @mul_carry_use_lowaccumhi(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_lowaccumhi( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR11]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) 
@llvm.fake.use(i32 %shr10) + ret i32 %add11 +} + +define i32 @mul_carry_use_lowaccum(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_carry_use_lowaccum( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] +; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] +; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 +; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] +; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] +; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD6]]) +; CHECK-NEXT: ret i32 [[ADD11]] +; +entry: + %shr = lshr i32 %x, 16 + %and = and i32 %x, 65535 + %shr1 = lshr i32 %y, 16 + %and2 = and i32 %y, 65535 + %mul = mul nuw i32 %shr, %and2 + %mul3 = mul nuw i32 %and, %shr1 + %add = add i32 %mul, %mul3 + %mul4 = mul nuw i32 %and, %and2 + %shr5 = lshr i32 %mul4, 16 + %add6 = add i32 %add, %shr5 + %cmp = icmp ult i32 %add6, %mul + %cond = select i1 %cmp, i32 65536, i32 0 + %mul8 = mul nuw i32 %shr, %shr1 + %add9 = add nuw i32 %mul8, %cond + %shr10 = lshr i32 %add6, 16 + %add11 = add i32 %add9, %shr10 + call void (...) @llvm.fake.use(i32 %add6) + ret i32 %add11 +} diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll new file mode 100644 index 0000000000000..d92434a7a7ea5 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll @@ -0,0 +1,3227 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; https://alive2.llvm.org/ce/z/KuJPnU +define i64 @umulh(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 
[[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[TMP4]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; Commutative ops should match in any order. Ops where operand order has been +; reversed from above are marked 'commuted'. As per instcombine contributors +; guide, constants are always canonicalized to RHS, so don't bother commuting +; constants. +define i64 @umulh__commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_LO_X_HI]], [[Y_HI_X_LO]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[Y_LO_X_LO_HI]], [[CROSS_SUM_LO]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[CARRY]], [[INTERMEDIATE]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[LOW_ACCUM_HI]], [[INTERMEDIATE_PLUS_CARRY]] +; CHECK-NEXT: ret i64 [[TMP4]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted + %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; 
commuted
+
+  ; Add cross terms
+  %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+  ; High 32 bits of low product
+  %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+  ; Low and high 32 bits of cross_sum
+  %cross_sum_lo = and i64 %cross_sum, 4294967295
+  %cross_sum_hi = lshr i64 %cross_sum, 32
+
+  %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted
+
+  ; Final result accumulation
+  %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
+  %low_accum_hi = lshr i64 %low_accum, 32
+  %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
+  %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted
+
+  ret i64 %hw64
+}
+
+define i32 @mulh_src32(i32 %x, i32 %y) {
+  ; Extract low and high 16 bits
+; CHECK-LABEL: define i32 @mulh_src32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i32 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i32 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i32 65536, i32 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i32 [[Y_LO_X_LO]], 16
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i32 [[CROSS_SUM]], 65535
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i32 [[CROSS_SUM]], 16
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i32 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i32 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i32 [[LOW_ACCUM]], 16
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i32 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+  %x_lo = and i32 %x, u0xffff ; x & 0xffff
+  %y_lo = and i32 %y, u0xffff ; y & 0xffff
+  %x_hi = lshr i32 %x, 16 ; x >> 16
+  %y_hi = lshr i32 %y, 16 ; y >> 16
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i32 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i32 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i32 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i32 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i32 %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i32 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i32 u0x10000, i32 0 ; if overflow, add 1 << 16
+
+  ; High 16 bits of low product
+  %y_lo_x_lo_hi = lshr i32 %y_lo_x_lo, 16
+
+  ; Low and high 16 bits of cross_sum
+  %cross_sum_lo = and i32 %cross_sum, u0xffff
+  %cross_sum_hi = lshr i32 %cross_sum, 16
+
+  %low_accum = add nuw nsw i32 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i32 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i32 %low_accum, 16
+  %intermediate_plus_carry = add i32 %intermediate, %carry
+  %hw64 = add i32 %intermediate_plus_carry, %low_accum_hi
+
+  ret i32 %hw64
+}
+
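+; A compact reference oracle for these expansions (illustrative sketch, not
+; part of the tested IR; assumes a compiler that provides __uint128_t):
+;
+;   uint64_t umulh64_ref(uint64_t x, uint64_t y) {
+;     return (uint64_t)(((__uint128_t)x * y) >> 64);
+;   }
+;
+; Every limb-based sequence in this file must agree with the corresponding
+; wide multiply-and-shift for all inputs; the alive2 links machine-check the
+; rewrites.
+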
+define i128 @mulh_src128(i128 %x, i128 %y) {
+  ; Extract low and high 64 bits
+; CHECK-LABEL: define i128 @mulh_src128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i128 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i128 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i128 18446744073709551616, i128 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i128 [[Y_LO_X_LO]], 64
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i128 [[CROSS_SUM]], 18446744073709551615
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i128 [[CROSS_SUM]], 64
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i128 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i128 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i128 [[LOW_ACCUM]], 64
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i128 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i128 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i128 [[HW64]]
+;
+  %x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffffffffffff
+  %y_lo = and i128 %y, u0xffffffffffffffff ; y & 0xffffffffffffffff
+  %x_hi = lshr i128 %x, 64 ; x >> 64
+  %y_hi = lshr i128 %y, 64 ; y >> 64
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw i128 %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw i128 %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw i128 %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw i128 %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add i128 %y_hi_x_lo, %y_lo_x_hi ; full 128-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult i128 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i128 u0x10000000000000000, i128 0 ; if overflow, add 1 << 64
+
+  ; High 64 bits of low product
+  %y_lo_x_lo_hi = lshr i128 %y_lo_x_lo, 64
+
+  ; Low and high 64 bits of cross_sum
+  %cross_sum_lo = and i128 %cross_sum, u0xffffffffffffffff
+  %cross_sum_hi = lshr i128 %cross_sum, 64
+
+  %low_accum = add nuw nsw i128 %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw i128 %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr i128 %low_accum, 64
+  %intermediate_plus_carry = add i128 %intermediate, %carry
+  %hw64 = add i128 %intermediate_plus_carry, %low_accum_hi
+
+  ret i128 %hw64
+}
+
+define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
+  ; Extract low and high 16 bits
+; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] 
= add <2 x i32> [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult <2 x i32> [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select <2 x i1> [[CARRY_OUT]], <2 x i32> splat (i32 65536), <2 x i32> zeroinitializer
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr <2 x i32> [[Y_LO_X_LO]], splat (i32 16)
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and <2 x i32> [[CROSS_SUM]], splat (i32 65535)
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr <2 x i32> [[CROSS_SUM]], splat (i32 16)
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw <2 x i32> [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw <2 x i32> [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr <2 x i32> [[LOW_ACCUM]], splat (i32 16)
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add <2 x i32> [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add <2 x i32> [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret <2 x i32> [[HW64]]
+;
+  %x_lo = and <2 x i32> %x, <i32 65535, i32 65535>
+  %y_lo = and <2 x i32> %y, <i32 65535, i32 65535>
+  %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+  %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+  ; Cross products
+  %y_lo_x_hi = mul nuw <2 x i32> %y_lo, %x_hi ; y_lo * x_hi
+  %y_hi_x_hi = mul nuw <2 x i32> %y_hi, %x_hi ; y_hi * x_hi
+  %y_hi_x_lo = mul nuw <2 x i32> %y_hi, %x_lo ; y_hi * x_lo
+  %y_lo_x_lo = mul nuw <2 x i32> %y_lo, %x_lo ; y_lo * x_lo
+
+  ; Add cross terms
+  %cross_sum = add <2 x i32> %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
+
+  ; Carry if overflowed
+  %carry_out = icmp ult <2 x i32> %cross_sum, %y_lo_x_hi
+  %carry = select <2 x i1> %carry_out, <2 x i32> <i32 65536, i32 65536>, <2 x i32> <i32 0, i32 0>
+
+  ; High 16 bits of low product
+  %y_lo_x_lo_hi = lshr <2 x i32> %y_lo_x_lo, <i32 16, i32 16>
+
+  ; Low and high 16 bits of cross_sum
+  %cross_sum_lo = and <2 x i32> %cross_sum, <i32 65535, i32 65535>
+  %cross_sum_hi = lshr <2 x i32> %cross_sum, <i32 16, i32 16>
+
+  %low_accum = add nuw nsw <2 x i32> %cross_sum_lo, %y_lo_x_lo_hi
+
+  ; Final result accumulation
+  %intermediate = add nuw <2 x i32> %cross_sum_hi, %y_hi_x_hi
+  %low_accum_hi = lshr <2 x i32> %low_accum, <i32 16, i32 16>
+  %intermediate_plus_carry = add <2 x i32> %intermediate, %carry
+  %hw64 = add <2 x i32> %intermediate_plus_carry, %low_accum_hi
+
+  ret <2 x i32> %hw64
+}
+
+; https://alive2.llvm.org/ce/z/PPXtkR
+define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 
[[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + ; Store high 64 bits + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + ; Reconstruct low 64 bits + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + ; Store low 64 bits + store i64 %lw64, ptr %p, align 8 + + ret void +} + + +; Negative tests + +define i64 @umulh_notandx(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notandx( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 
[[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967294 ; x & 0xfffffffe + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notandy(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notandy( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967294 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967294 ; y & 0xfffffffe + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low 
and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notshiftx(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notshiftx( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 16 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 16 ; x >> 16 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notshifty(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notshifty( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 
[[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 16 ; y >> 16 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notcarry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notcarry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967295, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = 
lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967295, i64 0 ; if overflow, add wrong value + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +define i64 @umulh_notxlo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_notxlo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x ; y_lo * x + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 
%carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notcrosssum(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notcrosssum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = shl i64 [[Y_HI_X_LO]], 1
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967294
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_hi_x_lo ; wrong cross_sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+
+
+; Use-count tests.
+
+; 'x_lo' can have more than 2 uses.
+define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]]) +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + call void (...) @llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi' can have more than 2 uses. +define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI]])
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+ call void (...) @llvm.fake.use(i64 %y_hi)
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_lo * x_hi' must have no more than 2 uses.
+define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: call void (...)
@llvm.fake.use(i64 [[Y_LO_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + call void (...) @llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_hi' must have single use. +define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + call void (...) @llvm.fake.use(i64 %y_hi_x_hi) + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_hi * x_lo' must have single use. +define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI_X_LO]])
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_lo * x_lo' has a single use if only doing high part of multiply and 2 uses
+; when doing both low/high parts. Doing the optimization when only doing the
+; high part and there's a 2nd unrelated use here still results in fewer
+; instructions and is likely profitable, so this seems ok.
+define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: call void (...)
@llvm.fake.use(i64 [[Y_LO_X_LO]]) +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + call void (...) @llvm.fake.use(i64 %y_lo_x_lo) + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum' must have no more than 3 uses. +define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM]]) +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + call void (...) @llvm.fake.use(i64 %cross_sum) + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry_out' must have single use. +define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry_out( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i1 [[CARRY_OUT]]) +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) @llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'carry' must have single use. +define i64 @umulh__mul_use__carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CARRY]]) +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + call void (...) @llvm.fake.use(i64 %carry) + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'y_lo_x_lo_hi' must have single use. +define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO_HI]]) +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi) + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_lo' must have single use. +define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_LO]]) +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) @llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'cross_sum_hi' must have single use. +define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[CROSS_SUM_HI]]) +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) @llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum' has a single use if only doing high part of multiply and 2 uses +; when doing both low/high parts. Unrelated use here, but still seems +; profitable. +define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[TMP5]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) @llvm.fake.use(i64 %low_accum) + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate' must have single use. +define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[INTERMEDIATE]]) +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + call void (...) @llvm.fake.use(i64 %intermediate) + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'low_accum_hi' must have single use. +define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[LOW_ACCUM_HI]]) +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) @llvm.fake.use(i64 %low_accum_hi) + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + + ret i64 %hw64 +} + +; 'intermediate_plus_carry' must have single use. +define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[INTERMEDIATE_PLUS_CARRY]]) +; CHECK-NEXT: ret i64 [[HW64]] +; + ; Extract low and high 32 bits + %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff + %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff + %x_hi = lshr i64 %x, 32 ; x >> 32 + %y_hi = lshr i64 %y, 32 ; y >> 32 + + ; Cross products + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo + + ; Add cross terms + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum + + ; Carry if overflowed + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32 + + ; High 32 bits of low product + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + ; Low and high 32 bits of cross_sum + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + ; Final result accumulation + %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi + %low_accum_hi = lshr i64 %low_accum, 32 + %intermediate_plus_carry = add i64 %intermediate, %carry + %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi + call void (...) @llvm.fake.use(i64 %intermediate_plus_carry) + + ret i64 %hw64 +} + + +; 'x_lo' can have multiple uses. +define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]]) +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + call void (...) 
@llvm.fake.use(i64 %x_lo) + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo' can have multiple uses. +define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO]]) +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + call void (...) 
@llvm.fake.use(i64 %y_lo) + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'x_hi' can have multiple uses. +define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_HI]]) +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + call void (...) 
@llvm.fake.use(i64 %x_hi) + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi' can have multiple uses. +define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]]) +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + call void (...) 
@llvm.fake.use(i64 %y_hi) + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo_x_hi' must have exactly 2 uses. +define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + call void (...) 
@llvm.fake.use(i64 %y_lo_x_hi) + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi_x_hi' must have single use. +define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]]) +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + call void (...) 
@llvm.fake.use(i64 %y_hi_x_hi) + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_hi_x_lo' must have single use. +define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]]) +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + call void (...) 
@llvm.fake.use(i64 %y_hi_x_lo)
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_lo_x_lo' can have multiple uses.
+; TODO: this does not simplify as it should.
+define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[TMP6]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[TMP6]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[TMP6]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI1:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS1:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS1]], [[LOW_ACCUM_HI1]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+ call void (...) 
@llvm.fake.use(i64 %y_lo_x_lo) + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum' must have no more than 3 uses. +define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM]]) +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + call void (...) 
@llvm.fake.use(i64 %cross_sum) + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'carry_out' must have single use. +define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i1 [[CARRY_OUT]]) +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + call void (...) 
@llvm.fake.use(i1 %carry_out) + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'carry' must have single use. +define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__carry( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CARRY]]) +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + call void (...) 
@llvm.fake.use(i64 %carry) + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'y_lo_x_lo_hi' must have single use. +define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]]) +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + call void (...) 
@llvm.fake.use(i64 %y_lo_x_lo_hi) + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum_lo' must have single use. +define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]]) +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + call void (...) 
@llvm.fake.use(i64 %cross_sum_lo) + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'cross_sum_hi' must have single use. +define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]]) +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + call void (...) 
@llvm.fake.use(i64 %cross_sum_hi) + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'low_accum' must have exactly 2 uses if doing high multiply. +define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]]) +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + call void (...) 
@llvm.fake.use(i64 %low_accum) + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'upper_mid' must have single use. +define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[UPPER_MID]]) +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP9]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + call void (...) 
@llvm.fake.use(i64 %upper_mid) + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'low_accum_hi' must have single use. +define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]]) +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + call void (...) 
@llvm.fake.use(i64 %low_accum_hi)
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'upper_mid_with_cross' must have single use.
+define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[UPPER_MID_WITH_CROSS]])
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ call void (...) 
@llvm.fake.use(i64 %low_accum_hi) + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + +; 'low_accum_shifted' can have multiple uses. +define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) { +; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] +; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 +; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 +; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 +; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 +; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] +; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]]) +; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 +; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi + %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi + %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo + %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo + + %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi + + %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi + %carry = select i1 %carry_out, i64 4294967296, i64 0 + + %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32 + + %cross_sum_lo = and i64 %cross_sum, 4294967295 + %cross_sum_hi = lshr i64 %cross_sum, 32 + + %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi + + %upper_mid = add nuw i64 %y_hi_x_hi, %carry + %low_accum_hi = lshr i64 %low_accum, 32 + %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi + %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi + + %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8 + store i64 %hw64, ptr %hi_ptr, align 8 + + %low_accum_shifted = shl i64 %low_accum, 32 + call void (...) 
@llvm.fake.use(i64 %low_accum_shifted) + %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295 + %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo + + store i64 %lw64, ptr %p, align 8 + + ret void +} + diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll new file mode 100644 index 0000000000000..6e56eb86516c5 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll @@ -0,0 +1,904 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; https://alive2.llvm.org/ce/z/MSo5S_ +define i64 @umulh_variant(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[TMP5]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i32 @umulh_variant_i32(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @umulh_variant_i32( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i32 [[T0]], 16 +; CHECK-NEXT: [[U0:%.*]] = add nuw i32 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i32 [[U0]], 65535 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i32 [[U0]], 16 +; CHECK-NEXT: [[U1:%.*]] = add nuw i32 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i32 [[U1]], 16 +; CHECK-NEXT: [[U2:%.*]] = add nuw i32 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i32 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i32 [[HW64]] +; + %x_lo = and i32 %x, u0xffff + %y_lo = and i32 %y, u0xffff + %x_hi = lshr i32 %x, 16 + %y_hi = lshr i32 %y, 16 + + %t0 = mul nuw i32 %y_lo, %x_lo + %t1 = mul nuw i32 %y_lo, %x_hi + %t2 = mul nuw i32 %y_hi, 
%x_lo
+  %t3 = mul nuw i32 %y_hi, %x_hi
+
+  %t0_hi = lshr i32 %t0, 16
+
+  %u0 = add nuw i32 %t0_hi, %t1
+  %u0_lo = and i32 %u0, u0xffff
+  %u0_hi = lshr i32 %u0, 16
+  %u1 = add nuw i32 %u0_lo, %t2
+  %u1_hi = lshr i32 %u1, 16
+  %u2 = add nuw i32 %u0_hi, %t3
+  %hw64 = add nuw i32 %u2, %u1_hi
+  ret i32 %hw64
+}
+
+define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: define <2 x i32> @umulh_variant_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT:    [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT:    [[T0:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[T1:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[T2:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[T0_HI:%.*]] = lshr <2 x i32> [[T0]], splat (i32 16)
+; CHECK-NEXT:    [[U0:%.*]] = add nuw <2 x i32> [[T0_HI]], [[T1]]
+; CHECK-NEXT:    [[U0_LO:%.*]] = and <2 x i32> [[U0]], splat (i32 65535)
+; CHECK-NEXT:    [[U0_HI:%.*]] = lshr <2 x i32> [[U0]], splat (i32 16)
+; CHECK-NEXT:    [[U1:%.*]] = add nuw <2 x i32> [[U0_LO]], [[T2]]
+; CHECK-NEXT:    [[U1_HI:%.*]] = lshr <2 x i32> [[U1]], splat (i32 16)
+; CHECK-NEXT:    [[U2:%.*]] = add nuw <2 x i32> [[U0_HI]], [[T3]]
+; CHECK-NEXT:    [[HW64:%.*]] = add nuw <2 x i32> [[U2]], [[U1_HI]]
+; CHECK-NEXT:    ret <2 x i32> [[HW64]]
+;
+  %x_lo = and <2 x i32> %x, <i32 65535, i32 65535>
+  %y_lo = and <2 x i32> %y, <i32 65535, i32 65535>
+  %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+  %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+  %t0 = mul nuw <2 x i32> %y_lo, %x_lo
+  %t1 = mul nuw <2 x i32> %y_lo, %x_hi
+  %t2 = mul nuw <2 x i32> %y_hi, %x_lo
+  %t3 = mul nuw <2 x i32> %y_hi, %x_hi
+
+  %t0_hi = lshr <2 x i32> %t0, <i32 16, i32 16>
+
+  %u0 = add nuw <2 x i32> %t0_hi, %t1
+  %u0_lo = and <2 x i32> %u0, <i32 65535, i32 65535>
+  %u0_hi = lshr <2 x i32> %u0, <i32 16, i32 16>
+  %u1 = add nuw <2 x i32> %u0_lo, %t2
+  %u1_hi = lshr <2 x i32> %u1, <i32 16, i32 16>
+  %u2 = add nuw <2 x i32> %u0_hi, %t3
+  %hw64 = add nuw <2 x i32> %u2, %u1_hi
+  ret <2 x i32> %hw64
+}
+
+define i128 @umulh_variant_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @umulh_variant_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT:    [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT:    [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT:    [[X_HI:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT:    [[Y_HI:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT:    [[T0:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT:    [[T1:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT:    [[T2:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT:    [[T3:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT:    [[T0_HI:%.*]] = lshr i128 [[T0]], 64
+; CHECK-NEXT:    [[U0:%.*]] = add nuw i128 [[T0_HI]], [[T1]]
+; CHECK-NEXT:    [[U0_LO:%.*]] = and i128 [[U0]], 18446744073709551615
+; CHECK-NEXT:    [[U0_HI:%.*]] = lshr i128 [[U0]], 64
+; CHECK-NEXT:    [[U1:%.*]] = add nuw i128 [[U0_LO]], [[T2]]
+; CHECK-NEXT:    [[U1_HI:%.*]] = lshr i128 [[U1]], 64
+; CHECK-NEXT:    [[U2:%.*]] = add nuw i128 [[U0_HI]], [[T3]]
+; CHECK-NEXT:    [[HW64:%.*]] = add nuw i128 [[U2]], [[U1_HI]]
+; CHECK-NEXT:    ret i128 [[HW64]]
+;
+  %x_lo = and i128 %x, u0xffffffffffffffff
+  %y_lo = and i128 %y, u0xffffffffffffffff
+  %x_hi = lshr i128 %x, 64
+  %y_hi = lshr i128 %y, 64
+
+  %t0 = mul nuw i128 %y_lo, %x_lo
+  %t1 = mul nuw i128 %y_lo, %x_hi
+  %t2 = mul nuw i128 %y_hi, %x_lo
+  
%t3 = mul nuw i128 %y_hi, %x_hi + + %t0_hi = lshr i128 %t0, 64 + + %u0 = add nuw i128 %t0_hi, %t1 + %u0_lo = and i128 %u0, u0xffffffffffffffff + %u0_hi = lshr i128 %u0, 64 + %u1 = add nuw i128 %u0_lo, %t2 + %u1_hi = lshr i128 %u1, 64 + %u2 = add nuw i128 %u0_hi, %t3 + %hw64 = add nuw i128 %u2, %u1_hi + ret i128 %hw64 +} + +define i64 @umulh_variant_commuted(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_commuted( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[X_HI]], [[Y_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T1]], [[T0_HI]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[T2]], [[U0_LO]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw nsw i64 [[U1_HI]], [[U0_HI]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[T3]], [[U2]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %x_lo, %y_lo + %t1 = mul nuw i64 %x_lo, %y_hi + %t2 = mul nuw i64 %x_hi, %y_lo + %t3 = mul nuw i64 %x_hi, %y_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t1, %t0_hi + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %t2, %u0_lo + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u1_hi, %u0_hi + %hw64 = add nuw i64 %t3, %u2 + ret i64 %hw64 +} + + + +; Negative tests + +define i64 @umulh_variant_notlox(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_notlox( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967294 ; wrong imm + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret 
i64 %hw64 +} + +define i64 @umulh_variant_nothiy(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_nothiy( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 16 ; wrong imm + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i64 @umulh_variant_notlowacc(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_notlowacc( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967294 ; wrong imm + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +define i64 @umulh_variant_notll(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant_notll( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t3 = mul nuw i64 %y_lo, %x_lo ; swapped lolo and hihi + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t0 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + + + +; Use checks + +; 't0' can have more than one use. +define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + call void (...) @llvm.fake.use(i64 %t0) + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't1' can have more than one use. 
+define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + call void (...) @llvm.fake.use(i64 %t1) + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't2' can have more than one use. +define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + call void (...) @llvm.fake.use(i64 %t2) + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't3' must have single use. 
+define i64 @umulh_variant__mul_use__t3(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t3( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T3]]) +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + call void (...) @llvm.fake.use(i64 %t3) + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 't0_hi' must have single use. +define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0_HI]]) +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + call void (...) @llvm.fake.use(i64 %t0_hi) + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0' must have single use. 
+define i64 @umulh_variant__mul_use__u0(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0]]) +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + call void (...) @llvm.fake.use(i64 %u0) + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_lo' must have single use. +define i64 @umulh_variant__mul_use__u0_lo(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_lo( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_LO]]) +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + call void (...) @llvm.fake.use(i64 %u0_lo) + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u0_hi' must have single use. 
+define i64 @umulh_variant__mul_use__u0_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_HI]]) +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + call void (...) @llvm.fake.use(i64 %u0_hi) + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1' must have single use. +define i64 @umulh_variant__mul_use__u1(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1]]) +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + call void (...) @llvm.fake.use(i64 %u1) + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u1_hi' must have single use. 
+define i64 @umulh_variant__mul_use__u1_hi(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1_hi( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 +; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1_HI]]) +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + call void (...) @llvm.fake.use(i64 %u1_hi) + %u2 = add nuw i64 %u0_hi, %t3 + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} + +; 'u2' must have single use. +define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) { +; CHECK-LABEL: define i64 @umulh_variant__mul_use__u2( +; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { +; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 +; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 +; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 +; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 +; CHECK-NEXT: [[U0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] +; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] +; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] +; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 +; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_HI]], [[T1]] +; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U1]], 4294967295 +; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 +; CHECK-NEXT: [[U3:%.*]] = add nuw i64 [[U0_LO]], [[T2]] +; CHECK-NEXT: [[U1_HI1:%.*]] = lshr i64 [[U3]], 32 +; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U1_HI]], [[T3]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U2]]) +; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI1]] +; CHECK-NEXT: ret i64 [[HW64]] +; + %x_lo = and i64 %x, 4294967295 + %y_lo = and i64 %y, 4294967295 + %x_hi = lshr i64 %x, 32 + %y_hi = lshr i64 %y, 32 + + %t0 = mul nuw i64 %y_lo, %x_lo + %t1 = mul nuw i64 %y_lo, %x_hi + %t2 = mul nuw i64 %y_hi, %x_lo + %t3 = mul nuw i64 %y_hi, %x_hi + + %t0_hi = lshr i64 %t0, 32 + + %u0 = add nuw i64 %t0_hi, %t1 + %u0_lo = and i64 %u0, 4294967295 + %u0_hi = lshr i64 %u0, 32 + %u1 = add nuw i64 %u0_lo, %t2 + %u1_hi = lshr i64 %u1, 32 + %u2 = add nuw i64 %u0_hi, %t3 + call void (...) 
@llvm.fake.use(i64 %u2) + %hw64 = add nuw i64 %u2, %u1_hi + ret i64 %hw64 +} diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll new file mode 100644 index 0000000000000..5f84bc4e93b82 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s + +; Ladder4 variant. https://alive2.llvm.org/ce/z/tExFRs +define i32 @mul_ladder4(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]] +; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = lshr i32 %mulhl, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + +define <2 x i32> @mul_ladder4_v2i32(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: define <2 x i32> @mul_ladder4_v2i32( +; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and <2 x i32> [[X]], splat (i32 65535) +; CHECK-NEXT: [[XH:%.*]] = lshr <2 x i32> [[X]], splat (i32 16) +; CHECK-NEXT: [[YL:%.*]] = and <2 x i32> [[Y]], splat (i32 65535) +; CHECK-NEXT: [[YH:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16) +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw <2 x i32> [[XL]], [[YL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw <2 x i32> [[XL]], [[YH]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw <2 x i32> [[XH]], [[YL]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw <2 x i32> [[XH]], [[YH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr <2 x i32> [[MULLL]], splat (i32 16) +; CHECK-NEXT: [[CONV10:%.*]] = and <2 x i32> [[MULLH]], splat (i32 65535) +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i32> [[SHR8]], [[CONV10]] +; CHECK-NEXT: [[CONV12:%.*]] = and <2 x i32> [[MULHL]], splat (i32 65535) +; CHECK-NEXT: 
[[ADD13:%.*]] = add nuw nsw <2 x i32> [[ADD]], [[CONV12]]
+; CHECK-NEXT:    [[SHR14:%.*]] = lshr <2 x i32> [[ADD13]], splat (i32 16)
+; CHECK-NEXT:    [[SHR15:%.*]] = lshr <2 x i32> [[MULLH]], splat (i32 16)
+; CHECK-NEXT:    [[ADD16:%.*]] = add nuw <2 x i32> [[MULHH]], [[SHR15]]
+; CHECK-NEXT:    [[SHR17:%.*]] = lshr <2 x i32> [[MULHL]], splat (i32 16)
+; CHECK-NEXT:    [[ADD18:%.*]] = add nuw <2 x i32> [[ADD16]], [[SHR17]]
+; CHECK-NEXT:    [[ADD19:%.*]] = add nuw <2 x i32> [[ADD18]], [[SHR14]]
+; CHECK-NEXT:    ret <2 x i32> [[ADD19]]
+;
+entry:
+  %xl = and <2 x i32> %x, <i32 65535, i32 65535>
+  %xh = lshr <2 x i32> %x, <i32 16, i32 16>
+  %yl = and <2 x i32> %y, <i32 65535, i32 65535>
+  %yh = lshr <2 x i32> %y, <i32 16, i32 16>
+  %mulll = mul nuw <2 x i32> %xl, %yl
+  %mullh = mul nuw <2 x i32> %xl, %yh
+  %mulhl = mul nuw <2 x i32> %xh, %yl
+  %mulhh = mul nuw <2 x i32> %xh, %yh
+  %shr8 = lshr <2 x i32> %mulll, <i32 16, i32 16>
+  %conv10 = and <2 x i32> %mullh, <i32 65535, i32 65535>
+  %add = add nuw nsw <2 x i32> %shr8, %conv10
+  %conv12 = and <2 x i32> %mulhl, <i32 65535, i32 65535>
+  %add13 = add nuw nsw <2 x i32> %add, %conv12
+  %shr14 = lshr <2 x i32> %add13, <i32 16, i32 16>
+  %shr15 = lshr <2 x i32> %mullh, <i32 16, i32 16>
+  %add16 = add nuw <2 x i32> %mulhh, %shr15
+  %shr17 = lshr <2 x i32> %mulhl, <i32 16, i32 16>
+  %add18 = add nuw <2 x i32> %add16, %shr17
+  %add19 = add nuw <2 x i32> %add18, %shr14
+  ret <2 x i32> %add19
+}
+
+define i128 @mul_ladder4_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @mul_ladder4_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[XL:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT:    [[XH:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT:    [[YL:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT:    [[YH:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT:    [[MULLL:%.*]] = mul nuw i128 [[XL]], [[YL]]
+; CHECK-NEXT:    [[MULLH:%.*]] = mul nuw i128 [[XL]], [[YH]]
+; CHECK-NEXT:    [[MULHL:%.*]] = mul nuw i128 [[XH]], [[YL]]
+; CHECK-NEXT:    [[MULHH:%.*]] = mul nuw i128 [[XH]], [[YH]]
+; CHECK-NEXT:    [[SHR8:%.*]] = lshr i128 [[MULLL]], 64
+; CHECK-NEXT:    [[CONV10:%.*]] = and i128 [[MULLH]], 18446744073709551615
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i128 [[SHR8]], [[CONV10]]
+; CHECK-NEXT:    [[CONV12:%.*]] = and i128 [[MULHL]], 18446744073709551615
+; CHECK-NEXT:    [[ADD13:%.*]] = add nuw nsw i128 [[ADD]], [[CONV12]]
+; CHECK-NEXT:    [[SHR14:%.*]] = lshr i128 [[ADD13]], 64
+; CHECK-NEXT:    [[SHR15:%.*]] = lshr i128 [[MULLH]], 64
+; CHECK-NEXT:    [[ADD16:%.*]] = add nuw i128 [[MULHH]], [[SHR15]]
+; CHECK-NEXT:    [[SHR17:%.*]] = lshr i128 [[MULHL]], 64
+; CHECK-NEXT:    [[ADD18:%.*]] = add nuw i128 [[ADD16]], [[SHR17]]
+; CHECK-NEXT:    [[ADD19:%.*]] = add nuw i128 [[ADD18]], [[SHR14]]
+; CHECK-NEXT:    ret i128 [[ADD19]]
+;
+entry:
+  %xl = and i128 %x, u0xffffffffffffffff
+  %xh = lshr i128 %x, 64
+  %yl = and i128 %y, u0xffffffffffffffff
+  %yh = lshr i128 %y, 64
+  %mulll = mul nuw i128 %xl, %yl
+  %mullh = mul nuw i128 %xl, %yh
+  %mulhl = mul nuw i128 %xh, %yl
+  %mulhh = mul nuw i128 %xh, %yh
+  %shr8 = lshr i128 %mulll, 64
+  %conv10 = and i128 %mullh, u0xffffffffffffffff
+  %add = add nuw nsw i128 %shr8, %conv10
+  %conv12 = and i128 %mulhl, u0xffffffffffffffff
+  %add13 = add nuw nsw i128 %add, %conv12
+  %shr14 = lshr i128 %add13, 64
+  %shr15 = lshr i128 %mullh, 64
+  %add16 = add nuw i128 %mulhh, %shr15
+  %shr17 = lshr i128 %mulhl, 64
+  %add18 = add nuw i128 %add16, %shr17
+  %add19 = add nuw i128 %add18, %shr14
+  ret i128 %add19
+}
+
+define i32 @mul_ladder4_commuted(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_commuted(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[XL:%.*]] = and i32 
[[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %shr14, %shr17 + %add18 = add nuw i32 %add16, %shr15 + %add19 = add nuw i32 %mulhh, %add18 + ret i32 %add19 +} + +define i32 @mul_ladder4_swap_hl_lh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_swap_hl_lh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]] +; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mulhl, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mullh, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = 
lshr i32 %mullh, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + + +; Negative tests + +define i32 @mul_ladder4_notlhhl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_notlhhl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]] +; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %xl, %yl + %mullh = mul nuw i32 %xl, %yh + %mulhl = mul nuw i32 %xh, %yl + %mulhh = mul nuw i32 %xh, %yh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mulhl, 65535 + %add = add nuw nsw i32 %shr8, %conv10 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %add, %conv12 + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mulhl, 16 + %add16 = add nuw i32 %mulhh, %shr15 + %shr17 = lshr i32 %mulhl, 16 + %add18 = add nuw i32 %add16, %shr17 + %add19 = add nuw i32 %add18, %shr14 + ret i32 %add19 +} + + + + + + +; Extra uses + +define i32 @mul_ladder4_use_add13(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_add13( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[ADD13]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %add13) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_conv12(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_conv12( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]] +; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[CONV13]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) 
@llvm.fake.use(i32 %conv12) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_u0(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_u0( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV13]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL1]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %add) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_hl(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_hl( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) 
@llvm.fake.use(i32 [[MULHL]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) @llvm.fake.use(i32 %mulhl) + ret i32 %add19 +} + +define i32 @mul_ladder4_use_lh(i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @mul_ladder4_use_lh( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 +; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 +; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 +; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 +; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] +; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] +; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] +; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] +; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] +; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 +; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] +; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 +; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 +; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 +; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] +; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] +; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MULLH]]) +; CHECK-NEXT: ret i32 [[ADD19]] +; +entry: + %xl = and i32 %x, 65535 + %xh = lshr i32 %x, 16 + %yl = and i32 %y, 65535 + %yh = lshr i32 %y, 16 + %mulll = mul nuw i32 %yl, %xl + %mullh = mul nuw i32 %yh, %xl + %mulhl = mul nuw i32 %yl, %xh + %mulhh = mul nuw i32 %yh, %xh + %shr8 = lshr i32 %mulll, 16 + %conv10 = and i32 %mullh, 65535 + %add = add nuw nsw i32 %conv10, %shr8 + %conv12 = and i32 %mulhl, 65535 + %add13 = add nuw nsw i32 %conv12, %add + %shr14 = lshr i32 %add13, 16 + %shr15 = lshr i32 %mullh, 16 + %shr17 = lshr i32 %mulhl, 16 + %add16 = add i32 %shr14, %shr17 + %add18 = add i32 %add16, %shr15 + %add19 = add i32 %mulhh, %add18 + call void (...) 
@llvm.fake.use(i32 %mullh)
+  ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_conv10(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_conv10(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]]
+; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[CONV12]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+  %xl = and i32 %x, 65535
+  %xh = lshr i32 %x, 16
+  %yl = and i32 %y, 65535
+  %yh = lshr i32 %y, 16
+  %mulll = mul nuw i32 %yl, %xl
+  %mullh = mul nuw i32 %yh, %xl
+  %mulhl = mul nuw i32 %yl, %xh
+  %mulhh = mul nuw i32 %yh, %xh
+  %shr8 = lshr i32 %mulll, 16
+  %conv10 = and i32 %mullh, 65535
+  %add = add nuw nsw i32 %conv10, %shr8
+  %conv12 = and i32 %mulhl, 65535
+  %add13 = add nuw nsw i32 %conv12, %add
+  %shr14 = lshr i32 %add13, 16
+  %shr15 = lshr i32 %mullh, 16
+  %shr17 = lshr i32 %mulhl, 16
+  %add16 = add i32 %shr14, %shr17
+  %add18 = add i32 %add16, %shr15
+  %add19 = add i32 %mulhh, %add18
+  call void (...) @llvm.fake.use(i32 %conv10)
+  ret i32 %add19
+}

From 3c4f412bbb239ffe2a4f96be7c86e68d84f89ce6 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 17 Nov 2025 16:11:48 +0000
Subject: [PATCH 2/5] [AggressiveInstCombine] Match long high-half multiply

This patch adds recognition of high-half multiply by parts, combining it
into a single larger multiply. Considering a multiply made up of high and
low parts, we can split the multiply into:
  x * y == (xh*T + xl) * (yh*T + yl)
where xh == x>>32 and xl == x & 0xffffffff. T = 2^32. This expands to
  xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
which I find helpful to draw as
  [ xh*yh ]
      [ xh*yl ]
      [ xl*yh ]
          [ xl*yl ]
We are looking for the "high" half, which is
xh*yh + xh*yl>>32 + xl*yh>>32 + carries. The carry makes this difficult and
there are multiple ways of representing it. The ones we attempt to support
here are:
  Carry: xh*yh + carry + lowsum>>32
    carry = lowsum < xh*yl ? 0x100000000 : 0
    lowsum = xh*yl + xl*yh + (xl*yl>>32)
  Ladder: xh*yh + c2>>32 + c3>>32
    c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
  Carry4: xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
    crosssum = xh*yl + xl*yh
    carry = crosssum < xh*yl ? 0x100000000 : 0
  Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
    low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff

They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
tree is xh*yh, xh*yl, xl*yh and xl*yl.
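
As a concrete illustration (a minimal sketch only, not code from this
patch; the function name umulh64 and its exact shape are illustrative),
the Carry form above corresponds to portable source along these lines:

  #include <stdint.h>

  // 64x64->high-64 multiply built from 32-bit parts (the "Carry" shape).
  static uint64_t umulh64(uint64_t x, uint64_t y) {
    uint64_t xl = x & 0xffffffff, xh = x >> 32;
    uint64_t yl = y & 0xffffffff, yh = y >> 32;
    // The three-term sum wraps at most once; the compare detects the wrap.
    uint64_t lowsum = xh * yl + xl * yh + ((xl * yl) >> 32);
    uint64_t carry = lowsum < xh * yl ? 0x100000000ULL : 0;
    return xh * yh + carry + (lowsum >> 32);
  }

With the fold, a body like this becomes zext to i128, one mul, an lshr by
64 and a trunc, which targets with a high-multiply instruction can lower
efficiently.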
Based on #156879 by @c-rhodes
---
 .../AggressiveInstCombine.cpp                 | 301 +++++++++++++
 .../AggressiveInstCombine/umulh_carry.ll      | 140 ++-----
 .../AggressiveInstCombine/umulh_carry4.ll     | 394 +++++-------------
 .../AggressiveInstCombine/umulh_ladder.ll     | 176 ++------
 .../AggressiveInstCombine/umulh_ladder4.ll    | 120 ++----
 5 files changed, 499 insertions(+), 632 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index b575d76e897d2..fb71f57eaa502 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1466,6 +1466,306 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
   return false;
 }
 
+/// Match high part of long multiplication.
+///
+/// Considering a multiply made up of high and low parts, we can split the
+/// multiply into:
+///   x * y == (xh*T + xl) * (yh*T + yl)
+/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
+/// This expands to
+///   xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
+/// which can be drawn as
+///   [ xh*yh ]
+///       [ xh*yl ]
+///       [ xl*yh ]
+///           [ xl*yl ]
+/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
+/// some carries. The carry makes this difficult and there are multiple ways of
+/// representing it. The ones we attempt to support here are:
+///   Carry: xh*yh + carry + lowsum>>32
+///     carry = lowsum < xh*yl ? 0x100000000 : 0
+///     lowsum = xh*yl + xl*yh + (xl*yl>>32)
+///   Ladder: xh*yh + c2>>32 + c3>>32
+///     c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+///   Carry4: xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
+///     crosssum = xh*yl + xl*yh
+///     carry = crosssum < xh*yl ? 0x100000000 : 0
+///   Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
+///     low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+///
+/// They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
+/// tree is xh*yh, xh*yl, xl*yh and xl*yl.
+static bool foldMulHigh(Instruction &I) {
+  Type *Ty = I.getType();
+  if (!Ty->isIntOrIntVectorTy())
+    return false;
+
+  unsigned BW = Ty->getScalarSizeInBits();
+  APInt LowMask = APInt::getLowBitsSet(BW, BW / 2);
+  if (BW % 2 != 0)
+    return false;
+
+  auto CreateMulHigh = [&](Value *X, Value *Y) {
+    IRBuilder<> Builder(&I);
+    Type *NTy = Ty->getWithNewBitWidth(BW * 2);
+    Value *XExt = Builder.CreateZExt(X, NTy);
+    Value *YExt = Builder.CreateZExt(Y, NTy);
+    Value *Mul = Builder.CreateMul(XExt, YExt);
+    Value *High = Builder.CreateLShr(Mul, BW);
+    Value *Res = Builder.CreateTrunc(High, Ty);
+    I.replaceAllUsesWith(Res);
+    LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
+                      << *Y << "\n");
+    return true;
+  };
+
+  // Common check routines for X_lo*Y_lo and X_hi*Y_lo
+  auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
+    return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
+                               m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+  };
+  auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
+    return match(XhYl, m_c_Mul(m_LShr(m_Specific(X), m_SpecificInt(BW / 2)),
+                               m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+  };
+
+  auto foldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
+                              Instruction *B) {
+    // Looking for LowSum >> 32 and carry (select)
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, B);
+
+    // Carry = LowSum < XhYl ? 0x100000000 : 0
+    CmpPredicate Pred;
+    Value *LowSum, *XhYl;
+    if (!match(Carry,
+               m_OneUse(m_Select(
+                   m_OneUse(m_ICmp(Pred, m_Value(LowSum), m_Value(XhYl))),
+                   m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
+        Pred != ICmpInst::ICMP_ULT)
+      return false;
+
+    // XhYl can be Xh*Yl or Xl*Yh
+    if (!CheckHiLo(XhYl, X, Y)) {
+      if (CheckHiLo(XhYl, Y, X))
+        std::swap(X, Y);
+      else
+        return false;
+    }
+    if (XhYl->hasNUsesOrMore(3))
+      return false;
+
+    // B = LowSum >> 32
+    if (!match(B,
+               m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) ||
+        LowSum->hasNUsesOrMore(3))
+      return false;
+
+    // LowSum = XhYl + XlYh + XlYl>>32
+    Value *XlYh, *XlYl;
+    auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2));
+    if (!match(LowSum,
+               m_c_Add(m_Specific(XhYl),
+                       m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
+        !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
+                               m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
+        !match(LowSum,
+               m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
+                                                m_OneUse(m_Value(XlYh)))))))
+      return false;
+
+    // Check XlYl and XlYh
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+    if (!CheckHiLo(XlYh, Y, X))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto foldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
+                               Instruction *B) {
+    // xh*yh + c2>>32 + c3>>32
+    // c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
+    Value *XlYh, *XhYl, *C2, *C3;
+    // Strip off the two expected shifts.
+    if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) ||
+        !match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2))))
+      return false;
+
+    // Match c3 = c2&0xffffffff + xl*yh
+    if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
+                           m_Value(XhYl))))
+      std::swap(C2, C3);
+    if (!match(C3,
+               m_c_Add(m_OneUse(m_And(m_Specific(C2), m_SpecificInt(LowMask))),
+                       m_Value(XhYl))) ||
+        !C3->hasOneUse() || C2->hasNUsesOrMore(3))
+      return false;
+
+    // Match c2 = xh*yl + (xl*yl >> 32)
+    Value *XlYl;
+    if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+                           m_Value(XlYh))))
+      return false;
+
+    // Match XhYl and XlYh - they can appear either way around.
+    if (!CheckHiLo(XlYh, Y, X))
+      std::swap(XlYh, XhYl);
+    if (!CheckHiLo(XlYh, Y, X))
+      return false;
+    if (!CheckHiLo(XhYl, X, Y))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto foldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
+                                Instruction *B, Instruction *C) {
+    /// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
+    /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+
+    // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
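+    // A, B and C can arrive in any order, so try each operand as the
+    // shifted Low before giving up.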
+    auto ShiftAdd = m_LShr(m_Add(m_Value(), m_Value()), m_SpecificInt(BW / 2));
+    if (!match(A, ShiftAdd))
+      std::swap(A, B);
+    if (!match(A, ShiftAdd))
+      std::swap(A, C);
+    Value *Low;
+    if (!match(A, m_LShr(m_OneUse(m_Value(Low)), m_SpecificInt(BW / 2))))
+      return false;
+
+    // Match B == XhYl>>32 and C == XlYh>>32
+    Value *XhYl, *XlYh;
+    if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BW / 2))) ||
+        !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BW / 2))))
+      return false;
+    if (!CheckHiLo(XhYl, X, Y))
+      std::swap(XhYl, XlYh);
+    if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
+      return false;
+    if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
+      return false;
+
+    // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
+    Value *XlYl;
+    if (!match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+                    m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
+                m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))) &&
+        !match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+                    m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+                m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
+        !match(
+            Low,
+            m_c_Add(
+                m_OneUse(m_c_Add(
+                    m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
+                    m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+                m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  auto foldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
+                               Instruction *B, Instruction *C) {
+    // xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
+    // crosssum = xh*yl + xl*yh
+    // carry = crosssum < xh*yl ? 0x100000000 : 0
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, B);
+    if (Carry->getOpcode() != Instruction::Select)
+      std::swap(Carry, C);
+
+    // Carry = CrossSum < XhYl ? 0x100000000 : 0
+    CmpPredicate Pred;
+    Value *CrossSum, *XhYl;
+    if (!match(Carry,
+               m_OneUse(m_Select(
+                   m_OneUse(m_ICmp(Pred, m_Value(CrossSum), m_Value(XhYl))),
+                   m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
+        Pred != ICmpInst::ICMP_ULT)
+      return false;
+
+    if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+      std::swap(B, C);
+    if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+      return false;
+
+    Value *XlYl, *LowAccum;
+    if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BW / 2))) ||
+        !match(LowAccum,
+               m_c_Add(m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))),
+                       m_OneUse(m_And(m_Specific(CrossSum),
+                                      m_SpecificInt(LowMask))))) ||
+        LowAccum->hasNUsesOrMore(3))
+      return false;
+    if (!CheckLoLo(XlYl, X, Y))
+      return false;
+
+    if (!CheckHiLo(XhYl, X, Y))
+      std::swap(X, Y);
+    if (!CheckHiLo(XhYl, X, Y))
+      return false;
+    if (!match(CrossSum,
+               m_c_Add(m_Specific(XhYl),
+                       m_OneUse(m_c_Mul(
+                           m_LShr(m_Specific(Y), m_SpecificInt(BW / 2)),
+                           m_And(m_Specific(X), m_SpecificInt(LowMask)))))) ||
+        CrossSum->hasNUsesOrMore(4) || XhYl->hasNUsesOrMore(3))
+      return false;
+
+    return CreateMulHigh(X, Y);
+  };
+
+  // X and Y are the two inputs, A, B and C are other parts of the pattern
+  // (crosssum>>32, carry, etc.)
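+  // Reassociation can leave the adds in several tree shapes, so each
+  // association of HiHi with the remaining operands is tried below.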
+ Value *X, *Y; + Instruction *A, *B, *C; + auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BW / 2)), + m_LShr(m_Value(Y), m_SpecificInt(BW / 2)))); + if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A), + m_Instruction(B))))) || + match(&I, m_c_Add(m_Instruction(A), + m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) && + A->hasOneUse() && B->hasOneUse()) + if (foldMulHighCarry(X, Y, A, B) || foldMulHighLadder(X, Y, A, B)) + return true; + + if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add( + m_Instruction(A), + m_OneUse(m_Add(m_Instruction(B), + m_Instruction(C))))))) || + match(&I, m_c_Add(m_Instruction(A), + m_OneUse(m_c_Add( + HiHi, m_OneUse(m_Add(m_Instruction(B), + m_Instruction(C))))))) || + match(&I, m_c_Add(m_Instruction(A), + m_OneUse(m_c_Add( + m_Instruction(B), + m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) || + match(&I, + m_c_Add(m_OneUse(m_c_Add(HiHi, m_Instruction(A))), + m_OneUse(m_Add(m_Instruction(B), m_Instruction(C)))))) && + A->hasOneUse() && B->hasOneUse() && C->hasOneUse()) + return foldMulHighCarry4(X, Y, A, B, C) || + foldMulHighLadder4(X, Y, A, B, C); + + return false; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -1495,6 +1795,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT, MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT); MadeChange |= foldPatternedLoads(I, DL); MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT); + MadeChange |= foldMulHigh(I); // NOTE: This function introduces erasing of the instruction `I`, so it // needs to be called at the end of this sequence, otherwise we may make // bugs. diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll index b9801370028cc..b78095cac0df9 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll @@ -6,22 +6,11 @@ define i32 @mul_carry(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_carry( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] -; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD11]] ; entry: @@ -49,22 +38,11 @@ define i128 @mul_carry_i128(i128 %x, i128 %y) { ; CHECK-LABEL: 
define i128 @mul_carry_i128( ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i128 [[X]], 64 -; CHECK-NEXT: [[AND:%.*]] = and i128 [[X]], 18446744073709551615 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i128 [[Y]], 64 -; CHECK-NEXT: [[AND2:%.*]] = and i128 [[Y]], 18446744073709551615 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i128 [[SHR]], [[AND2]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i128 [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add i128 [[MUL]], [[MUL3]] -; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i128 [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr i128 [[MUL4]], 64 -; CHECK-NEXT: [[ADD6:%.*]] = add i128 [[ADD]], [[SHR5]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i128 [[ADD6]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i128 18446744073709551616, i128 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i128 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw i128 [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr i128 [[ADD6]], 64 -; CHECK-NEXT: [[ADD11:%.*]] = add i128 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256 +; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i256 [[TMP3]] to i128 ; CHECK-NEXT: ret i128 [[ADD11]] ; entry: @@ -92,22 +70,11 @@ define <4 x i32> @mul_carry_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: define <4 x i32> @mul_carry_v4i32( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[X]], splat (i32 65535) -; CHECK-NEXT: [[SHR1:%.*]] = lshr <4 x i32> [[Y]], splat (i32 16) -; CHECK-NEXT: [[AND2:%.*]] = and <4 x i32> [[Y]], splat (i32 65535) -; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[SHR]], [[AND2]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw <4 x i32> [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[MUL]], [[MUL3]] -; CHECK-NEXT: [[MUL4:%.*]] = mul nuw <4 x i32> [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr <4 x i32> [[MUL4]], splat (i32 16) -; CHECK-NEXT: [[ADD6:%.*]] = add <4 x i32> [[ADD]], [[SHR5]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[ADD6]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> splat (i32 65536), <4 x i32> zeroinitializer -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw <4 x i32> [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw <4 x i32> [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr <4 x i32> [[ADD6]], splat (i32 16) -; CHECK-NEXT: [[ADD11:%.*]] = add <4 x i32> [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i32> [[X]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[Y]] to <4 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <4 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], splat (i64 32) +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw <4 x i64> [[TMP3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[ADD11]] ; entry: @@ -135,22 +102,11 @@ define i32 @mul_carry_xlyh(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_carry_xlyh( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] -; 
CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] -; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL3]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD11]] ; entry: @@ -177,22 +133,11 @@ define i32 @mul_carry_comm(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_carry_comm( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[AND2]], [[SHR]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[SHR1]], [[AND]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL3]], [[MUL]] -; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[SHR5]], [[ADD]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 -; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[COND]], [[SHR10]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[MUL8]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD11]] ; entry: @@ -520,22 +465,15 @@ define i32 @mul_carry_use_llh(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_carry_use_llh( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] ; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]] ; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16 -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to 
i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR10]]) ; CHECK-NEXT: ret i32 [[ADD11]] ; @@ -564,22 +502,14 @@ define i32 @mul_carry_use_mulll(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_carry_use_mulll( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16 ; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]] -; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]] ; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]] -; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16 -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0 -; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]] -; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]] -; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16 -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL4]]) ; CHECK-NEXT: ret i32 [[ADD11]] ; diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll index d92434a7a7ea5..fa21721f17762 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll @@ -5,25 +5,11 @@ define i64 @umulh(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: 
[[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64 ; CHECK-NEXT: ret i64 [[TMP4]] ; ; Extract low and high 32 bits @@ -70,25 +56,11 @@ define i64 @umulh(i64 %x, i64 %y) { define i64 @umulh__commuted(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh__commuted( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_LO_X_HI]], [[Y_HI_X_LO]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[Y_LO_X_LO_HI]], [[CROSS_SUM_LO]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[CARRY]], [[INTERMEDIATE]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[LOW_ACCUM_HI]], [[INTERMEDIATE_PLUS_CARRY]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64 ; CHECK-NEXT: ret i64 [[TMP4]] ; ; Extract low and high 32 bits @@ -132,25 +104,11 @@ define i32 @mulh_src32(i32 %x, i32 %y) { ; Extract low and high 16 bits ; CHECK-LABEL: define i32 @mulh_src32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i32 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i32 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i32 65536, i32 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i32 [[Y_LO_X_LO]], 16 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i32 [[CROSS_SUM]], 65535 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i32 [[CROSS_SUM]], 16 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i32 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i32 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i32 [[LOW_ACCUM]], 16 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i32 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 
[[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i64 [[TMP4]] to i32 ; CHECK-NEXT: ret i32 [[TMP5]] ; %x_lo = and i32 %x, u0xffff ; x & 0xffffffff @@ -193,25 +151,11 @@ define i128 @mulh_src128(i128 %x, i128 %y) { ; Extract low and high 64 bits ; CHECK-LABEL: define i128 @mulh_src128( ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615 -; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i128 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i128 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i128 18446744073709551616, i128 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i128 [[Y_LO_X_LO]], 64 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i128 [[CROSS_SUM]], 18446744073709551615 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i128 [[CROSS_SUM]], 64 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i128 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i128 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i128 [[LOW_ACCUM]], 64 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i128 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[HW64:%.*]] = add i128 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[X]] to i256 +; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[Y]] to i256 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128 ; CHECK-NEXT: ret i128 [[HW64]] ; %x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffff @@ -254,25 +198,11 @@ define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) { ; Extract low and high 16 bits ; CHECK-LABEL: define <2 x i32> @mulh_v2i32( ; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535) -; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535) -; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16) -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add <2 x i32> [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult <2 x i32> [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select <2 x i1> [[CARRY_OUT]], <2 x i32> splat (i32 65536), <2 x i32> zeroinitializer -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr <2 x i32> [[Y_LO_X_LO]], splat (i32 16) -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and <2 x i32> [[CROSS_SUM]], splat (i32 65535) -; 
CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr <2 x i32> [[CROSS_SUM]], splat (i32 16) -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw <2 x i32> [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw <2 x i32> [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr <2 x i32> [[LOW_ACCUM]], splat (i32 16) -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add <2 x i32> [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[HW64:%.*]] = add <2 x i32> [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[Y]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[HW64]] ; %x_lo = and <2 x i32> %x, @@ -315,30 +245,14 @@ define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) { define void @full_mul_int128(i64 %x, i64 %y, ptr %p) { ; CHECK-LABEL: define void @full_mul_int128( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[X]], [[Y]] ; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -831,24 +745,11 @@ define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[X_LO]]) -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; ; Extract low and high 32 bits @@ -893,26 +794,13 @@ define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) { define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh__mul_use__y_hi( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI]]) -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; ; Extract low and high 32 bits @@ -1154,24 +1042,13 @@ define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) { ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] ; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO]]) -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[TMP5]] ; ; Extract low and high 32 bits @@ -1607,22 +1484,19 @@ define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) { ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul i64 [[Y]], [[X_HI]] +; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul i64 [[Y_HI]], [[X]] ; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] ; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 ; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 ; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 ; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] ; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]]) -; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[TMP5]] ; ; Extract low and high 32 bits @@ -1862,29 +1736,14 @@ define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[X_LO]]) -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -1932,31 +1791,16 @@ define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) { define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO]]) -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -2004,31 +1848,16 @@ define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) { define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) { ; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[X_HI]]) -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -2076,31 +1905,16 @@ define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) { define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) { ; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_HI]]) -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -2369,27 +2183,20 @@ define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) { ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] +; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul i64 [[Y]], [[X_HI]] +; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul i64 [[Y_HI]], [[X]] ; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[Y_LO_X_LO]]) ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[TMP6]], [[LOW_ACCUM_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[TMP6]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[TMP6]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI1:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS1:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS1]], [[LOW_ACCUM_HI1]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] +; CHECK-NEXT: [[LOW_ACCUM1:%.*]] = shl i64 [[TMP6]], 32 +; CHECK-NEXT: [[LW64:%.*]] = add i64 [[Y_LO_X_LO]], [[LOW_ACCUM1]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; @@ -3157,31 +2964,16 @@ define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) { ; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]] -; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0 -; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32 -; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295 -; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32 -; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]] -; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]] -; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32 -; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: 
[[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 ; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8 -; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32 +; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]] +; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = and i64 [[LW64]], -4294967296 ; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]]) -; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295 -; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]] ; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll index 6e56eb86516c5..745c61923d0f8 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll @@ -5,22 +5,11 @@ define i64 @umulh_variant(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh_variant( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[TMP5]] ; %x_lo = and i64 %x, 4294967295 @@ -48,22 +37,11 @@ define i64 @umulh_variant(i64 %x, i64 %y) { define i32 @umulh_variant_i32(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @umulh_variant_i32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i32 [[T0]], 16 -; CHECK-NEXT: [[U0:%.*]] = add nuw i32 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i32 [[U0]], 65535 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i32 [[U0]], 16 -; CHECK-NEXT: [[U1:%.*]] = add nuw i32 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i32 [[U1]], 16 -; CHECK-NEXT: [[U2:%.*]] = add nuw i32 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i32 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[X]] 
to i64 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i64 [[TMP4]] to i32 ; CHECK-NEXT: ret i32 [[HW64]] ; %x_lo = and i32 %x, u0xffff @@ -91,22 +69,11 @@ define i32 @umulh_variant_i32(i32 %x, i32 %y) { define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: define <2 x i32> @umulh_variant_v2i32( ; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535) -; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535) -; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16) -; CHECK-NEXT: [[T0:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr <2 x i32> [[T0]], splat (i32 16) -; CHECK-NEXT: [[U0:%.*]] = add nuw <2 x i32> [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and <2 x i32> [[U0]], splat (i32 65535) -; CHECK-NEXT: [[U0_HI:%.*]] = lshr <2 x i32> [[U0]], splat (i32 16) -; CHECK-NEXT: [[U1:%.*]] = add nuw <2 x i32> [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr <2 x i32> [[U1]], splat (i32 16) -; CHECK-NEXT: [[U2:%.*]] = add nuw <2 x i32> [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw <2 x i32> [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[HW64]] ; %x_lo = and <2 x i32> %x, @@ -134,22 +101,11 @@ define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) { define i128 @umulh_variant_i128(i128 %x, i128 %y) { ; CHECK-LABEL: define i128 @umulh_variant_i128( ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615 -; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i128 [[T0]], 64 -; CHECK-NEXT: [[U0:%.*]] = add nuw i128 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i128 [[U0]], 18446744073709551615 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i128 [[U0]], 64 -; CHECK-NEXT: [[U1:%.*]] = add nuw i128 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i128 [[U1]], 64 -; CHECK-NEXT: [[U2:%.*]] = add nuw i128 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i128 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256 +; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[X]] to i256 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128 ; CHECK-NEXT: ret i128 [[HW64]] ; %x_lo = and i128 %x, u0xffffffffffffffff @@ -177,22 +133,11 @@ define i128 @umulh_variant_i128(i128 
%x, i128 %y) { define i64 @umulh_variant_commuted(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh_variant_commuted( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[X_HI]], [[Y_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T1]], [[T0_HI]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[T2]], [[U0_LO]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw nsw i64 [[U1_HI]], [[U0_HI]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[T3]], [[U2]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; %x_lo = and i64 %x, 4294967295 @@ -403,21 +348,13 @@ define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] ; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0]]) -; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; %x_lo = and i64 %x, 4294967295 @@ -447,23 +384,15 @@ define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) { define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { -; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 ; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] ; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[T1]]) -; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; %x_lo = and i64 %x, 4294967295 @@ -494,22 +423,14 @@ define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) { ; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 -; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 ; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 -; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] ; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] ; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T2]]) -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] -; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; %x_lo = and i64 %x, 4294967295 @@ -587,21 +508,14 @@ define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) { ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) { ; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295 ; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295 -; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32 -; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32 ; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]] -; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]] -; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]] -; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]] ; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32 ; CHECK-NEXT: call void (...) 
@llvm.fake.use(i64 [[T0_HI]]) -; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]] -; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295 -; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32 -; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]] -; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32 -; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]] -; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64 +; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64 ; CHECK-NEXT: ret i64 [[HW64]] ; %x_lo = and i64 %x, 4294967295 diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll index 5f84bc4e93b82..307fc62a6b4ba 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll @@ -6,25 +6,11 @@ define i32 @mul_ladder4(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_ladder4( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] -; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]] -; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] -; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] -; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 -; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 -; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] -; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 -; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] -; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 -; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 -; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] -; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 -; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]] -; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD19]] ; entry: @@ -54,25 +40,11 @@ define <2 x i32> @mul_ladder4_v2i32(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: define <2 x i32> @mul_ladder4_v2i32( ; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[XL:%.*]] = and <2 x i32> [[X]], splat (i32 65535) -; CHECK-NEXT: [[XH:%.*]] = lshr <2 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[YL:%.*]] = and <2 x i32> [[Y]], splat (i32 65535) -; CHECK-NEXT: [[YH:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16) -; CHECK-NEXT: [[MULLL:%.*]] = mul nuw <2 x i32> [[XL]], [[YL]] -; CHECK-NEXT: [[MULLH:%.*]] = mul nuw <2 x i32> [[XL]], [[YH]] -; CHECK-NEXT: [[MULHL:%.*]] = mul nuw <2 x i32> [[XH]], [[YL]] -; CHECK-NEXT: [[MULHH:%.*]] = mul nuw <2 x i32> [[XH]], [[YH]] -; CHECK-NEXT: [[SHR8:%.*]] = lshr <2 x i32> [[MULLL]], splat (i32 16) -; CHECK-NEXT: [[CONV10:%.*]] = and <2 x i32> [[MULLH]], splat (i32 65535) -; CHECK-NEXT: 
[[ADD:%.*]] = add nuw nsw <2 x i32> [[SHR8]], [[CONV10]] -; CHECK-NEXT: [[CONV12:%.*]] = and <2 x i32> [[MULHL]], splat (i32 65535) -; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw <2 x i32> [[ADD]], [[CONV12]] -; CHECK-NEXT: [[SHR14:%.*]] = lshr <2 x i32> [[ADD13]], splat (i32 16) -; CHECK-NEXT: [[SHR15:%.*]] = lshr <2 x i32> [[MULLH]], splat (i32 16) -; CHECK-NEXT: [[ADD16:%.*]] = add nuw <2 x i32> [[MULHH]], [[SHR15]] -; CHECK-NEXT: [[SHR17:%.*]] = lshr <2 x i32> [[MULHL]], splat (i32 16) -; CHECK-NEXT: [[ADD18:%.*]] = add nuw <2 x i32> [[ADD16]], [[SHR17]] -; CHECK-NEXT: [[ADD19:%.*]] = add nuw <2 x i32> [[ADD18]], [[SHR14]] +; CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[X]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32) +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw <2 x i64> [[TMP3]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[ADD19]] ; entry: @@ -102,25 +74,11 @@ define i128 @mul_ladder4_i128(i128 %x, i128 %y) { ; CHECK-LABEL: define i128 @mul_ladder4_i128( ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[XL:%.*]] = and i128 [[X]], 18446744073709551615 -; CHECK-NEXT: [[XH:%.*]] = lshr i128 [[X]], 64 -; CHECK-NEXT: [[YL:%.*]] = and i128 [[Y]], 18446744073709551615 -; CHECK-NEXT: [[YH:%.*]] = lshr i128 [[Y]], 64 -; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i128 [[XL]], [[YL]] -; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i128 [[XL]], [[YH]] -; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i128 [[XH]], [[YL]] -; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i128 [[XH]], [[YH]] -; CHECK-NEXT: [[SHR8:%.*]] = lshr i128 [[MULLL]], 64 -; CHECK-NEXT: [[CONV10:%.*]] = and i128 [[MULLH]], 18446744073709551615 -; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i128 [[SHR8]], [[CONV10]] -; CHECK-NEXT: [[CONV12:%.*]] = and i128 [[MULHL]], 18446744073709551615 -; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i128 [[ADD]], [[CONV12]] -; CHECK-NEXT: [[SHR14:%.*]] = lshr i128 [[ADD13]], 64 -; CHECK-NEXT: [[SHR15:%.*]] = lshr i128 [[MULLH]], 64 -; CHECK-NEXT: [[ADD16:%.*]] = add nuw i128 [[MULHH]], [[SHR15]] -; CHECK-NEXT: [[SHR17:%.*]] = lshr i128 [[MULHL]], 64 -; CHECK-NEXT: [[ADD18:%.*]] = add nuw i128 [[ADD16]], [[SHR17]] -; CHECK-NEXT: [[ADD19:%.*]] = add nuw i128 [[ADD18]], [[SHR14]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256 +; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i256 [[TMP3]] to i128 ; CHECK-NEXT: ret i128 [[ADD19]] ; entry: @@ -150,25 +108,11 @@ define i32 @mul_ladder4_commutted(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_ladder4_commutted( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]] -; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]] -; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]] -; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]] -; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 -; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535 -; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]] -; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535 -; 
CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]] -; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 -; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16 -; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16 -; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]] -; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]] -; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[MULHH]], [[ADD18]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD19]] ; entry: @@ -198,25 +142,11 @@ define i32 @mul_ladder4_swap_hl_lh(i32 %x, i32 %y) { ; CHECK-LABEL: define i32 @mul_ladder4_swap_hl_lh( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535 -; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535 -; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16 -; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]] -; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]] -; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]] -; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]] -; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16 -; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535 -; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]] -; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULLH]], 65535 -; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]] -; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16 -; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16 -; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]] -; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULLH]], 16 -; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]] -; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32 +; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[ADD19]] ; entry: From 87e377b3450b5c8039de7a61104ed942181fa6c1 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 19 Nov 2025 08:59:53 +0000 Subject: [PATCH 3/5] Address comments --- .../AggressiveInstCombine.cpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index fb71f57eaa502..12ee2fe4efd83 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1513,6 +1513,7 @@ static bool foldMulHigh(Instruction &I) { Value *Mul = Builder.CreateMul(XExt, YExt); Value *High = Builder.CreateLShr(Mul, BW); Value *Res = Builder.CreateTrunc(High, Ty); + Res->takeName(&I); I.replaceAllUsesWith(Res); LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and " << *Y << "\n"); @@ -1529,20 +1530,20 @@ static bool foldMulHigh(Instruction &I) { m_And(m_Specific(Y), m_SpecificInt(LowMask)))); }; - auto foldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry, + auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry, 
Instruction *B) { // Looking for LowSum >> 32 and carry (select) if (Carry->getOpcode() != Instruction::Select) std::swap(Carry, B); // Carry = LowSum < XhYl ? 0x100000000 : 0 - CmpPredicate Pred; Value *LowSum, *XhYl; if (!match(Carry, m_OneUse(m_Select( - m_OneUse(m_ICmp(Pred, m_Value(LowSum), m_Value(XhYl))), - m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) || - Pred != ICmpInst::ICMP_ULT) + m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum), + m_Value(XhYl))), + m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), + m_SpecificInt(0))))) return false; // XhYl can be Xh*Yl or Xl*Yh @@ -1583,7 +1584,7 @@ static bool foldMulHigh(Instruction &I) { return CreateMulHigh(X, Y); }; - auto foldMulHighLadder = [&](Value *X, Value *Y, Instruction *A, + auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A, Instruction *B) { // xh*yh + c2>>32 + c3>>32 // c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh @@ -1622,7 +1623,7 @@ static bool foldMulHigh(Instruction &I) { return CreateMulHigh(X, Y); }; - auto foldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A, + auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A, Instruction *B, Instruction *C) { /// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32; /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff @@ -1679,7 +1680,7 @@ static bool foldMulHigh(Instruction &I) { return CreateMulHigh(X, Y); }; - auto foldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry, + auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry, Instruction *B, Instruction *C) { // xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32 // crosssum = xh*yl+xl*yh @@ -1690,13 +1691,13 @@ static bool foldMulHigh(Instruction &I) { std::swap(Carry, C); // Carry = CrossSum < XhYl ?
0x100000000 : 0 - CmpPredicate Pred; Value *CrossSum, *XhYl; if (!match(Carry, m_OneUse(m_Select( - m_OneUse(m_ICmp(Pred, m_Value(CrossSum), m_Value(XhYl))), - m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) || - Pred != ICmpInst::ICMP_ULT) + m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, + m_Value(CrossSum), m_Value(XhYl))), + m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), + m_SpecificInt(0))))) return false; if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2)))) @@ -1741,7 +1742,7 @@ static bool foldMulHigh(Instruction &I) { match(&I, m_c_Add(m_Instruction(A), m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) && A->hasOneUse() && B->hasOneUse()) - if (foldMulHighCarry(X, Y, A, B) || foldMulHighLadder(X, Y, A, B)) + if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B)) return true; if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add( @@ -1760,8 +1761,8 @@ static bool foldMulHigh(Instruction &I) { m_c_Add(m_OneUse(m_c_Add(HiHi, m_Instruction(A))), m_OneUse(m_Add(m_Instruction(B), m_Instruction(C)))))) && A->hasOneUse() && B->hasOneUse() && C->hasOneUse()) - return foldMulHighCarry4(X, Y, A, B, C) || - foldMulHighLadder4(X, Y, A, B, C); + return FoldMulHighCarry4(X, Y, A, B, C) || + FoldMulHighLadder4(X, Y, A, B, C); return false; } From 5e8433bba888f8ffbde809d58466174b04102dda Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 20 Nov 2025 08:44:58 +0000 Subject: [PATCH 4/5] Address Comments 2 --- .../AggressiveInstCombine.cpp | 68 +++++++++++-------- .../AggressiveInstCombine/umulh_ladder.ll | 40 +++++++++++ 2 files changed, 80 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 12ee2fe4efd83..7e11b863a2869 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1487,6 +1487,7 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, /// lowsum = xh*yl + xl*yh + (xl*yl>>32) /// Ladder: xh*yh + c2>>32 + c3>>32 /// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh +/// or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh /// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32 /// crosssum = xh*yl + xl*yh /// carry = crosssum < xh*yl ? 
0x100000000 : 0 @@ -1510,9 +1511,9 @@ static bool foldMulHigh(Instruction &I) { Type *NTy = Ty->getWithNewBitWidth(BW * 2); Value *XExt = Builder.CreateZExt(X, NTy); Value *YExt = Builder.CreateZExt(Y, NTy); - Value *Mul = Builder.CreateMul(XExt, YExt); + Value *Mul = Builder.CreateMul(XExt, YExt, "", true); Value *High = Builder.CreateLShr(Mul, BW); - Value *Res = Builder.CreateTrunc(High, Ty); + Value *Res = Builder.CreateTrunc(High, Ty, "", true); Res->takeName(&I); I.replaceAllUsesWith(Res); LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and " @@ -1542,8 +1543,7 @@ static bool foldMulHigh(Instruction &I) { m_OneUse(m_Select( m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum), m_Value(XhYl))), - m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), - m_SpecificInt(0))))) + m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero())))) return false; // XhYl can be Xh*Yl or Xl*Yh @@ -1556,7 +1556,7 @@ if (XhYl->hasNUsesOrMore(3)) return false; - // B = LowSum >> 16 + // B = LowSum >> 32 if (!match(B, m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) || LowSum->hasNUsesOrMore(3)) return false; @@ -1587,28 +1587,43 @@ auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A, Instruction *B) { // xh*yh + c2>>32 + c3>>32 - // c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh - Value *XlYh, *XhYl, *C2, *C3; + // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh + // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh + Value *XlYh, *XhYl, *XlYl, *C2, *C3; // Strip off the two expected shifts. if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) || !match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2)))) return false; - // Match c3 = c2&0xffffffff + xl*yh - if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)), - m_Value(XhYl)))) + if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value()))) std::swap(C2, C3); - if (!match(C3, - m_c_Add(m_OneUse(m_And(m_Specific(C2), m_SpecificInt(LowMask))), - m_Value(XhYl))) || - !C3->hasOneUse() || C2->hasNUsesOrMore(3)) - return false; + // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32) + if (match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)), + m_Value(XlYh)), + m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)))) || + match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)), + m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))), + m_Value(XlYh))) || + match(C2, m_c_Add(m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)), + m_Value(XlYh)), + m_And(m_Specific(C3), m_SpecificInt(LowMask))))) { + XhYl = C3; + } else { + // Match c3 = c2&0xffffffff + xl*yh + if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)), + m_Value(XlYh)))) + std::swap(C2, C3); + if (!match(C3, m_c_Add(m_OneUse( + m_And(m_Specific(C2), m_SpecificInt(LowMask))), + m_Value(XlYh))) || + !C3->hasOneUse() || C2->hasNUsesOrMore(3)) + return false; - // Match c2 = xh*yl + (xl*yl >> 32) - Value *XlYl; - if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)), - m_Value(XlYh)))) - return false; + // Match c2 = xh*yl + (xl*yl >> 32) + if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)), + m_Value(XhYl)))) + return false; + } // Match XhYl and XlYh - they can appear either way around.
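// Illustrative check of the ladder identity matched above (an expository comment, assuming BW == 64 so each half is 32 bits):
// with x = 2^32*xh + xl and y = 2^32*yh + yl,
//   (x*y) >> 64 == xh*yh + (c2 >> 32) + (c3 >> 32)
// for c2 = xh*yl + (xl*yl >> 32) and c3 = (c2 & 0xffffffff) + xl*yh.
// For example, x == y == 2^64-1 makes every half 0xffffffff, so xl*yl == 0xfffffffe00000001,
// c2 == 0xfffffffeffffffff, c3 == 0xffffffff00000000, and the total is 0xfffffffffffffffe,
// which matches ((2^64-1) * (2^64-1)) >> 64.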
if (!CheckHiLo(XlYh, Y, X)) @@ -1696,8 +1711,7 @@ static bool foldMulHigh(Instruction &I) { m_OneUse(m_Select( m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(CrossSum), m_Value(XhYl))), - m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), - m_SpecificInt(0))))) + m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero())))) return false; if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2)))) @@ -1720,12 +1734,10 @@ std::swap(X, Y); if (!CheckHiLo(XhYl, X, Y)) return false; - if (!match(CrossSum, - m_c_Add(m_Specific(XhYl), - m_OneUse(m_c_Mul( - m_LShr(m_Specific(Y), m_SpecificInt(BW / 2)), - m_And(m_Specific(X), m_SpecificInt(LowMask)))))) || - CrossSum->hasNUsesOrMore(4) || XhYl->hasNUsesOrMore(3)) + Value *XlYh; + if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) || + !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) || + XhYl->hasNUsesOrMore(3)) return false; return CreateMulHigh(X, Y); diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll index 745c61923d0f8..257cc0315c72f 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll @@ -816,3 +816,43 @@ define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) { %hw64 = add nuw i64 %u2, %u1_hi ret i64 %hw64 } + +define [2 x i64] @XXH_mult64to128(i64 noundef %lhs, i64 noundef %rhs) { +; CHECK-LABEL: define [2 x i64] @XXH_mult64to128( +; CHECK-SAME: i64 noundef [[LHS:%.*]], i64 noundef [[RHS:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[RHS]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[LHS]] to i128 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i128 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64 +; CHECK-NEXT: [[ADD16:%.*]] = trunc nuw i128 [[TMP3]] to i64 +; CHECK-NEXT: [[SHR102:%.*]] = mul i64 [[LHS]], [[RHS]] +; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[SHR102]], 0 +; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[ADD16]], 1 +; CHECK-NEXT: ret [2 x i64] [[DOTFCA_1_INSERT]] +; +entry: + %and = and i64 %lhs, 4294967295 + %and1 = and i64 %rhs, 4294967295 + %mul.i = mul nuw i64 %and1, %and + %shr = lshr i64 %lhs, 32 + %mul.i27 = mul nuw i64 %and1, %shr + %shr5 = lshr i64 %rhs, 32 + %mul.i28 = mul nuw i64 %shr5, %and + %mul.i29 = mul nuw i64 %shr5, %shr + %shr10 = lshr i64 %mul.i, 32 + %and11 = and i64 %mul.i27, 4294967295 + %add = add nuw i64 %and11, %mul.i28 + %add12 = add nuw i64 %add, %shr10 + %shr13 = lshr i64 %mul.i27, 32 + %shr14 = lshr i64 %add12, 32 + %add15 = add nuw i64 %shr13, %mul.i29 + %add16 = add nuw i64 %add15, %shr14 + %shl = shl i64 %add12, 32 + %and17 = and i64 %mul.i, 4294967295 + %or = or disjoint i64 %shl, %and17 + %.fca.0.insert = insertvalue [2 x i64] poison, i64 %or, 0 + %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %add16, 1 + ret [2 x i64] %.fca.1.insert +} + From d0cb9f31ec2aaaa6368bc040289401c0078be697 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 26 Nov 2025 15:26:15 +0000 Subject: [PATCH 5/5] Address comments --- .../AggressiveInstCombine.cpp | 86 +++++++++++-------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 7e11b863a2869..7ed8fb68f107e 100644 ---
a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1501,19 +1501,19 @@ static bool foldMulHigh(Instruction &I) { if (!Ty->isIntOrIntVectorTy()) return false; - unsigned BW = Ty->getScalarSizeInBits(); - APInt LowMask = APInt::getLowBitsSet(BW, BW / 2); - if (BW % 2 != 0) + unsigned BitWidth = Ty->getScalarSizeInBits(); + APInt LowMask = APInt::getLowBitsSet(BitWidth, BitWidth / 2); + if (BitWidth % 2 != 0) return false; auto CreateMulHigh = [&](Value *X, Value *Y) { IRBuilder<> Builder(&I); - Type *NTy = Ty->getWithNewBitWidth(BW * 2); + Type *NTy = Ty->getWithNewBitWidth(BitWidth * 2); Value *XExt = Builder.CreateZExt(X, NTy); Value *YExt = Builder.CreateZExt(Y, NTy); - Value *Mul = Builder.CreateMul(XExt, YExt, "", true); - Value *High = Builder.CreateLShr(Mul, BW); - Value *Res = Builder.CreateTrunc(High, Ty, "", true); + Value *Mul = Builder.CreateMul(XExt, YExt, "", /*HasNUW=*/true); + Value *High = Builder.CreateLShr(Mul, BitWidth); + Value *Res = Builder.CreateTrunc(High, Ty, "", /*HasNUW=*/true); Res->takeName(&I); I.replaceAllUsesWith(Res); LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and " @@ -1527,8 +1527,9 @@ static bool foldMulHigh(Instruction &I) { m_And(m_Specific(Y), m_SpecificInt(LowMask)))); }; auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) { - return match(XhYl, m_c_Mul(m_LShr(m_Specific(X), m_SpecificInt(BW / 2)), - m_And(m_Specific(Y), m_SpecificInt(LowMask)))); + return match(XhYl, + m_c_Mul(m_LShr(m_Specific(X), m_SpecificInt(BitWidth / 2)), + m_And(m_Specific(Y), m_SpecificInt(LowMask)))); }; auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry, @@ -1543,7 +1544,8 @@ static bool foldMulHigh(Instruction &I) { m_OneUse(m_Select( m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum), m_Value(XhYl))), - m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero())))) + m_SpecificInt(APInt::getOneBitSet(BitWidth, BitWidth / 2)), + m_Zero())))) return false; // XhYl can be Xh*Yl or Xl*Yh @@ -1557,14 +1559,14 @@ static bool foldMulHigh(Instruction &I) { return false; // B = LowSum >> 32 - if (!match(B, - m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) || + if (!match(B, m_OneUse(m_LShr(m_Specific(LowSum), + m_SpecificInt(BitWidth / 2)))) || LowSum->hasNUsesOrMore(3)) return false; // LowSum = XhYl + XlYh + XlYl>>32 Value *XlYh, *XlYl; - auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)); + auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)); if (!match(LowSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) && @@ -1591,20 +1593,23 @@ static bool foldMulHigh(Instruction &I) { // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xh*yl Value *XlYh, *XhYl, *XlYl, *C2, *C3; // Strip off the two expected shifts. 
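// In IR terms, and taking the i64 ladder tests as an illustration (value names here are
// hypothetical; the actual shift amount is BitWidth / 2), A and B are expected to look like
//   %a = lshr i64 %c2, 32
//   %b = lshr i64 %c3, 32
// so peeling one logical shift off each recovers C2 and C3 for the matching below.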
- if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) || - !match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2)))) + if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BitWidth / 2))) || + !match(B, m_LShr(m_Value(C3), m_SpecificInt(BitWidth / 2)))) return false; if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value()))) std::swap(C2, C3); // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32) - if (match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)), - m_Value(XlYh)), - m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)))) || + if (match(C2, + m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)), + m_Value(XlYh)), + m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)))) || match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)), - m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))), + m_LShr(m_Value(XlYl), + m_SpecificInt(BitWidth / 2))), m_Value(XlYh))) || - match(C2, m_c_Add(m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)), + match(C2, m_c_Add(m_c_Add(m_LShr(m_Value(XlYl), + m_SpecificInt(BitWidth / 2)), m_Value(XlYh)), m_And(m_Specific(C3), m_SpecificInt(LowMask))))) { XhYl = C3; @@ -1620,7 +1625,7 @@ static bool foldMulHigh(Instruction &I) { return false; // Match c2 = xh*yl + (xl*yl >> 32) - if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)), + if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)), m_Value(XhYl)))) return false; } @@ -1644,19 +1649,20 @@ static bool foldMulHigh(Instruction &I) { /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32. - auto ShiftAdd = m_LShr(m_Add(m_Value(), m_Value()), m_SpecificInt(BW / 2)); + auto ShiftAdd = + m_LShr(m_Add(m_Value(), m_Value()), m_SpecificInt(BitWidth / 2)); if (!match(A, ShiftAdd)) std::swap(A, B); if (!match(A, ShiftAdd)) std::swap(A, C); Value *Low; - if (!match(A, m_LShr(m_OneUse(m_Value(Low)), m_SpecificInt(BW / 2)))) + if (!match(A, m_LShr(m_OneUse(m_Value(Low)), m_SpecificInt(BitWidth / 2)))) return false; // Match B == XhYl>>32 and C == XlYh>>32 Value *XhYl, *XlYh; - if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BW / 2))) || - !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BW / 2)))) + if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BitWidth / 2))) || + !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BitWidth / 2)))) return false; if (!CheckHiLo(XhYl, X, Y)) std::swap(XhYl, XlYh); @@ -1673,20 +1679,23 @@ static bool foldMulHigh(Instruction &I) { m_OneUse(m_c_Add( m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))), m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))), - m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))) && + m_OneUse( + m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))) && !match( Low, m_c_Add( m_OneUse(m_c_Add( m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))), - m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))), + m_OneUse( + m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))), m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) && !match( Low, m_c_Add( m_OneUse(m_c_Add( m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))), - m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))), + m_OneUse( + m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))), m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask)))))) return false; if (!CheckLoLo(XlYl, X, Y)) @@ -1711,20 +1720,21 @@ static bool foldMulHigh(Instruction &I) { m_OneUse(m_Select( m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(CrossSum), 
m_Value(XhYl))), - m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero())))) + m_SpecificInt(APInt::getOneBitSet(BitWidth, BitWidth / 2)), + m_Zero())))) return false; - if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2)))) + if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2)))) std::swap(B, C); - if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2)))) + if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2)))) return false; Value *XlYl, *LowAccum; - if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BW / 2))) || - !match(LowAccum, - m_c_Add(m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))), - m_OneUse(m_And(m_Specific(CrossSum), - m_SpecificInt(LowMask))))) || + if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BitWidth / 2))) || + !match(LowAccum, m_c_Add(m_OneUse(m_LShr(m_Value(XlYl), + m_SpecificInt(BitWidth / 2))), + m_OneUse(m_And(m_Specific(CrossSum), + m_SpecificInt(LowMask))))) || LowAccum->hasNUsesOrMore(3)) return false; if (!CheckLoLo(XlYl, X, Y)) @@ -1747,8 +1757,8 @@ static bool foldMulHigh(Instruction &I) { // (crosssum>>32, carry, etc). Value *X, *Y; Instruction *A, *B, *C; - auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BW / 2)), - m_LShr(m_Value(Y), m_SpecificInt(BW / 2)))); + auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BitWidth / 2)), + m_LShr(m_Value(Y), m_SpecificInt(BitWidth / 2)))); if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A), m_Instruction(B))))) || match(&I, m_c_Add(m_Instruction(A),