[CodeGen] Emit more efficient magic numbers for exact udivs #87161

AtariDreams · 2024-03-30T16:34:15Z

Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel.

According to Hacker's Delight, the formula for an exact unsigned division is the same as for an exact signed division, except that the unsigned case uses a logical shift where the signed case uses an arithmetic shift.

llvmbot · 2024-03-30T16:34:45Z

@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-x86

Author: AtariDreams (AtariDreams)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/87161.diff

2 Files Affected:

(modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+71-1)
(added) llvm/test/CodeGen/X86/udiv-exact.ll (+171)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 962f0d98e3be90..277ea629cbd9ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6080,7 +6080,6 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
 
   // Shift the value upfront if it is even, so the LSB is one.
   if (UseSRA) {
-    // TODO: For UDIV use SRL instead of SRA.
     SDNodeFlags Flags;
     Flags.setExact(true);
     Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
@@ -6090,6 +6089,73 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
   return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
 }
 
+/// Given an exact UDIV by a constant (scalar, BUILD_VECTOR or SPLAT_VECTOR), create an optional exact SRL by the divisor's trailing-zero count followed by a multiplication
+/// with the multiplicative inverse of the remaining odd constant. Returns an empty SDValue when the operand match fails (for example, when a divisor element is zero).
+static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N,
+                              const SDLoc &dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDNode *> &Created) {
+  SDValue Op0 = N->getOperand(0); // dividend
+  SDValue Op1 = N->getOperand(1); // divisor
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); // type of the shift-amount operand
+  EVT ShSVT = ShVT.getScalarType();
+
+  bool UseSRL = false; // becomes true as soon as one divisor element is even
+  SmallVector<SDValue, 16> Shifts, Factors; // one shift amount and one inverse factor per divisor element
+
+  auto BuildUDIVPattern = [&](ConstantSDNode *C) { // callback: records shift and factor for one divisor constant
+    if (C->isZero()) // division by zero: reject, which abandons the whole transform
+      return false;
+    APInt Divisor = C->getAPIntValue();
+    unsigned Shift = Divisor.countr_zero(); // number of factors of two in the divisor
+    if (Shift) {
+      Divisor.lshrInPlace(Shift); // strip them so the remaining divisor is odd
+      UseSRL = true;
+    }
+    // Calculate the multiplicative inverse of the (now odd) divisor using Newton's method; the loop ends once Divisor * Factor == 1 in APInt arithmetic.
+    APInt t;
+    APInt Factor = Divisor;
+    while ((t = Divisor * Factor) != 1)
+      Factor *= APInt(Divisor.getBitWidth(), 2) - t; // Newton step: Factor *= (2 - Divisor * Factor)
+    Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
+    Factors.push_back(DAG.getConstant(Factor, dl, SVT));
+    return true;
+  };
+
+  // Run the callback over the divisor operand to collect all shift/factor values; give up if the match fails.
+  if (!ISD::matchUnaryPredicate(Op1, BuildUDIVPattern))
+    return SDValue();
+
+  SDValue Shift, Factor;
+  if (Op1.getOpcode() == ISD::BUILD_VECTOR) { // build vector: rebuild shift and factor vectors from the collected elements
+    Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+    Factor = DAG.getBuildVector(VT, dl, Factors);
+  } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) { // splat: the single collected shift/factor is splatted
+    assert(Shifts.size() == 1 && Factors.size() == 1 &&
+           "Expected matchUnaryPredicate to return one element for scalable "
+           "vectors");
+    Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+    Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+  } else { // scalar constant divisor
+    assert(isa<ConstantSDNode>(Op1) && "Expected a constant");
+    Shift = Shifts[0];
+    Factor = Factors[0];
+  }
+
+  SDValue Res = Op0;
+
+  // If any divisor element was even, shift the dividend right upfront so that only the odd part of the divisor is left for the multiply.
+  if (UseSRL) {
+    SDNodeFlags Flags;
+    Flags.setExact(true); // the shift is flagged exact, like the division it replaces
+    Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags); // logical shift for the unsigned case (the signed variant uses SRA)
+    Created.push_back(Res.getNode()); // report the intermediate node to the caller
+  }
+
+  return DAG.getNode(ISD::MUL, dl, VT, Res, Factor); // multiply by the inverse; this final node is returned, not added to Created
+}
+
 SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                               SelectionDAG &DAG,
                               SmallVectorImpl<SDNode *> &Created) const {
@@ -6349,6 +6415,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       return SDValue();
   }
 
+  // If the udiv has an 'exact' bit we can use a simpler lowering.
+  if (N->getFlags().hasExact())
+    return BuildExactUDIV(*this, N, dl, DAG, Created);
+
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
new file mode 100644
index 00000000000000..271d11edff9a76
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define i32 @test1(i32 %x) {
+; X86-LABEL: test1:
+; X86:       # %bb.0:
+; X86-NEXT:    imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29
+; X86-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0:
+; X64-NEXT:    imull $-1030792151, %edi, %eax # imm = 0xC28F5C29
+; X64-NEXT:    retq
+  %div = udiv exact i32 %x, 25
+  ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; X86-LABEL: test2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
+; X86-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # %bb.0:
+; X64-NEXT:    shrl $3, %edi
+; X64-NEXT:    imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT:    retq
+  %div = udiv exact i32 %x, 24
+  ret i32 %div
+}
+
+define <4 x i32> @test3(<4 x i32> %x) {
+; X86-LABEL: test3:
+; X86:       # %bb.0:
+; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrld $3, %xmm0, %xmm0
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test4(<4 x i32> %x) {
+; X86-LABEL: test4:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    pmuludq %xmm1, %xmm2
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test5(<4 x i32> %x) {
+; X86-LABEL: test5:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $3, %xmm1
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-NEXT:    movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test5:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test6(<4 x i32> %x) {
+; X86-LABEL: test6:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $3, %xmm1
+; X86-NEXT:    psrld $1, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997]
+; X86-NEXT:    pmuludq %xmm0, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    movdqa %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test6:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test7(<4 x i32> %x) {
+; X86-LABEL: test7:
+; X86:       # %bb.0:
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test7:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %x) {
+; X86-LABEL: test8:
+; X86:       # %bb.0:
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $3, %xmm1
+; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X86-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531]
+; X86-NEXT:    pmuludq %xmm1, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test8:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    retq
+  %div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
+  ret <4 x i32> %div
+}

arsenm

Can you make the mirror globalisel change?

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

github-actions · 2024-04-22T15:14:03Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

arsenm

Probably need a scalable vector test for the SPLAT_VECTOR case?

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

AtariDreams · 2024-05-04T00:59:40Z

> Probably need a scalable vector test for the SPLAT_VECTOR case?

I did that in the .mir example

arsenm · 2024-05-07T20:36:17Z

> > Probably need a scalable vector test for the SPLAT_VECTOR case?
>
> I did that in the .mir example

It would be better to have it in the end-to-end IR test. I think we're overusing MIR tests, especially for optimizations like this. Too many things can go wrong in the pipeline as a whole, so an optimization that works when testing MIR directly may still not be useful end to end.

llvm/test/CodeGen/X86/udiv-exact.ll

… udivs Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel.

AtariDreams · 2024-06-10T14:12:57Z

@arsenm Fixed!

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

AtariDreams · 2024-06-20T17:47:06Z

@arsenm Thoughts?

llvmbot added the backend:X86 and llvm:SelectionDAG (SelectionDAGISel as well) labels Mar 30, 2024

arsenm reviewed Apr 2, 2024

View reviewed changes

jayfoad reviewed Apr 2, 2024

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp Outdated Show resolved Hide resolved

AtariDreams force-pushed the phii branch from 5e14e67 to 04fbf82 Compare April 6, 2024 16:28

AtariDreams force-pushed the phii branch from 04fbf82 to 9196d84 Compare April 22, 2024 15:10

llvmbot added the llvm:globalisel label Apr 22, 2024

AtariDreams force-pushed the phii branch 3 times, most recently from 2987445 to 4db90f3 Compare April 22, 2024 19:08

topperc reviewed Apr 22, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

AtariDreams changed the title ~~[SelectionDAG] Emit a more efficient magic number multiplication for exact udivs~~ [CodeGen] Emit a more efficient magic number multiplication for exact udivs Apr 22, 2024

AtariDreams force-pushed the phii branch from 7433ebd to c23f6ce Compare April 22, 2024 20:46

llvmbot added the backend:AArch64 label Apr 22, 2024

AtariDreams requested review from topperc, arsenm and jayfoad April 22, 2024 20:46

topperc reviewed Apr 22, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

AtariDreams force-pushed the phii branch 2 times, most recently from 7532edd to 3e98249 Compare April 22, 2024 20:59

topperc reviewed Apr 22, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Outdated Show resolved Hide resolved

AtariDreams force-pushed the phii branch 3 times, most recently from c031895 to 724d53b Compare April 23, 2024 01:19

jayfoad reviewed Apr 23, 2024

View reviewed changes

AtariDreams force-pushed the phii branch from 939cfc8 to a192a53 Compare May 1, 2024 12:37

arsenm reviewed May 3, 2024

View reviewed changes

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp Show resolved Hide resolved

AtariDreams force-pushed the phii branch from a192a53 to e1f1773 Compare May 4, 2024 00:54

AtariDreams changed the title ~~[CodeGen] Emit a more efficient magic number multiplication for exact udivs~~ [CodeGen]: Emit a more efficient magic number multiplication for exact udivs May 4, 2024

AtariDreams changed the title ~~[CodeGen]: Emit a more efficient magic number multiplication for exact udivs~~ [CodeGen] Emit a more efficient magic number multiplication for exact udivs May 4, 2024

AtariDreams force-pushed the phii branch 3 times, most recently from 3aed2f5 to 88c06e4 Compare May 4, 2024 20:37

AtariDreams force-pushed the phii branch from 88c06e4 to 21ff12e Compare May 7, 2024 23:54

arsenm reviewed May 8, 2024

View reviewed changes

llvm/test/CodeGen/X86/udiv-exact.ll Show resolved Hide resolved

AtariDreams force-pushed the phii branch 2 times, most recently from a1fc48b to 3a9ac8c Compare June 10, 2024 13:57

AtariDreams added 2 commits June 10, 2024 10:12

[CodeGen] Pre-commit tests (NFC)

a7a38ff

[CodeGen] Emit a more efficient magic number multiplication for exact…

83fbf26

… udivs Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel.

AtariDreams force-pushed the phii branch from 3a9ac8c to 83fbf26 Compare June 10, 2024 14:12

arsenm reviewed Jun 12, 2024

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp Show resolved Hide resolved

arsenm reviewed Jun 12, 2024

View reviewed changes

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp Show resolved Hide resolved

AtariDreams requested review from jayfoad, topperc and arsenm June 13, 2024 14:43

AtariDreams changed the title ~~[CodeGen] Emit a more efficient magic number multiplication for exact udivs~~ [CodeGen] Emit more efficient magic numbers for exact udivs Jun 23, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[CodeGen] Emit more efficient magic numbers for exact udivs #87161

[CodeGen] Emit more efficient magic numbers for exact udivs #87161

AtariDreams commented Mar 30, 2024 •

edited

Loading

llvmbot commented Mar 30, 2024 •

edited

Loading

arsenm left a comment

github-actions bot commented Apr 22, 2024 •

edited

Loading

arsenm left a comment

AtariDreams commented May 4, 2024

arsenm commented May 7, 2024

AtariDreams commented Jun 10, 2024

AtariDreams commented Jun 20, 2024

[CodeGen] Emit more efficient magic numbers for exact udivs #87161

Are you sure you want to change the base?

[CodeGen] Emit more efficient magic numbers for exact udivs #87161

Conversation

AtariDreams commented Mar 30, 2024 • edited Loading

llvmbot commented Mar 30, 2024 • edited Loading

arsenm left a comment

Choose a reason for hiding this comment

github-actions bot commented Apr 22, 2024 • edited Loading

arsenm left a comment

Choose a reason for hiding this comment

AtariDreams commented May 4, 2024

arsenm commented May 7, 2024

AtariDreams commented Jun 10, 2024

AtariDreams commented Jun 20, 2024

AtariDreams commented Mar 30, 2024 •

edited

Loading

llvmbot commented Mar 30, 2024 •

edited

Loading

github-actions bot commented Apr 22, 2024 •

edited

Loading