[CodeGen] Emit more efficient magic numbers for exact udivs #87161
base: main
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86

Author: AtariDreams (AtariDreams)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/87161.diff

2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 962f0d98e3be90..277ea629cbd9ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6080,7 +6080,6 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
   // Shift the value upfront if it is even, so the LSB is one.
   if (UseSRA) {
-    // TODO: For UDIV use SRL instead of SRA.
     SDNodeFlags Flags;
     Flags.setExact(true);
     Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
@@ -6090,6 +6089,73 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
   return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
 }
+/// Given an exact UDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N,
+                              const SDLoc &dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDNode *> &Created) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  EVT ShSVT = ShVT.getScalarType();
+
+  bool UseSRL = false;
+  SmallVector<SDValue, 16> Shifts, Factors;
+
+  auto BuildUDIVPattern = [&](ConstantSDNode *C) {
+    if (C->isZero())
+      return false;
+    APInt Divisor = C->getAPIntValue();
+    unsigned Shift = Divisor.countr_zero();
+    if (Shift) {
+      Divisor.lshrInPlace(Shift);
+      UseSRL = true;
+    }
+    // Calculate the multiplicative inverse, using Newton's method.
+    APInt t;
+    APInt Factor = Divisor;
+    while ((t = Divisor * Factor) != 1)
+      Factor *= APInt(Divisor.getBitWidth(), 2) - t;
+    Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
+    Factors.push_back(DAG.getConstant(Factor, dl, SVT));
+    return true;
+  };
+
+  // Collect all magic values from the build vector.
+  if (!ISD::matchUnaryPredicate(Op1, BuildUDIVPattern))
+    return SDValue();
+
+  SDValue Shift, Factor;
+  if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+    Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+    Factor = DAG.getBuildVector(VT, dl, Factors);
+  } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) {
+    assert(Shifts.size() == 1 && Factors.size() == 1 &&
+           "Expected matchUnaryPredicate to return one element for scalable "
+           "vectors");
+    Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+    Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+  } else {
+    assert(isa<ConstantSDNode>(Op1) && "Expected a constant");
+    Shift = Shifts[0];
+    Factor = Factors[0];
+  }
+
+  SDValue Res = Op0;
+
+  // Shift the value upfront if it is even, so the LSB is one.
+  if (UseSRL) {
+    SDNodeFlags Flags;
+    Flags.setExact(true);
+    Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags);
+    Created.push_back(Res.getNode());
+  }
+
+  return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
+}
+
 SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                       SelectionDAG &DAG,
                                       SmallVectorImpl<SDNode *> &Created) const {
@@ -6349,6 +6415,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   }
+  // If the udiv has an 'exact' bit we can use a simpler lowering.
+  if (N->getFlags().hasExact())
+    return BuildExactUDIV(*this, N, dl, DAG, Created);
+
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll
new file mode 100644
index 00000000000000..271d11edff9a76
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udiv-exact.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define i32 @test1(i32 %x) {
+; X86-LABEL: test1:
+; X86: # %bb.0:
+; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %eax # imm = 0xC28F5C29
+; X86-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # %bb.0:
+; X64-NEXT: imull $-1030792151, %edi, %eax # imm = 0xC28F5C29
+; X64-NEXT: retq
+ %div = udiv exact i32 %x, 25
+ ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; X86-LABEL: test2:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: imull $-1431655765, %eax, %eax # imm = 0xAAAAAAAB
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # %bb.0:
+; X64-NEXT: shrl $3, %edi
+; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB
+; X64-NEXT: retq
+ %div = udiv exact i32 %x, 24
+ ret i32 %div
+}
+
+define <4 x i32> @test3(<4 x i32> %x) {
+; X86-LABEL: test3:
+; X86: # %bb.0:
+; X86-NEXT: psrld $3, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # %bb.0:
+; X64-NEXT: vpsrld $3, %xmm0, %xmm0
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 24, i32 24>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test4(<4 x i32> %x) {
+; X86-LABEL: test4:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # %bb.0:
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test5(<4 x i32> %x) {
+; X86-LABEL: test5:
+; X86: # %bb.0:
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 25, i32 25>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test6(<4 x i32> %x) {
+; X86-LABEL: test6:
+; X86: # %bb.0:
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: psrld $1, %xmm0
+; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997]
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 24, i32 24, i32 26, i32 26>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test7(<4 x i32> %x) {
+; X86-LABEL: test7:
+; X86: # %bb.0:
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # %bb.0:
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 25, i32 25, i32 27, i32 27>
+ ret <4 x i32> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %x) {
+; X86-LABEL: test8:
+; X86: # %bb.0:
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $3, %xmm1
+; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
+ %div = udiv exact <4 x i32> %x, <i32 1, i32 1, i32 24, i32 24>
+ ret <4 x i32> %div
+}
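As a reader's note (not part of the patch): the multiply constants in the CHECK lines above fall straight out of modular arithmetic. 0xC28F5C29 (printed as -1030792151) is the multiplicative inverse of 25 mod 2^32, and 0xAAAAAAAB (2863311531) is the inverse of 3, the odd part of 24, which is why the divide-by-24 cases first shift right by 3. A minimal standalone C++ check of these two constants:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // 0xC28F5C29 * 25 == 1 (mod 2^32), so for any x that is an exact
  // multiple of 25, x * 0xC28F5C29 recovers x / 25 (mod 2^32).
  uint32_t inv25 = 0xC28F5C29u;
  assert(inv25 * 25u == 1u);
  assert(1000u * inv25 == 40u); // 1000 / 25 == 40

  // 24 == 8 * 3: shift right by 3 first, then multiply by the inverse
  // of the remaining odd factor 3, which is 0xAAAAAAAB.
  uint32_t inv3 = 0xAAAAAAABu;
  assert(inv3 * 3u == 1u);
  assert((4800u >> 3) * inv3 == 200u); // 4800 / 24 == 200
  return 0;
}
```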
Can you make the mirror GlobalISel change?
✅ With the latest revision this PR passed the C/C++ code formatter.
Probably need a scalable vector test for the SPLAT_VECTOR case?
I did that in the .mir example.
Would be better to have it in the end-to-end IR test. I think we're overusing MIR tests, especially for optimizations like this. Too many things can go wrong in the pipeline as a whole such that the optimization isn't useful when testing MIR directly.
… udivs

Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel.
@arsenm Fixed!

@arsenm Thoughts?
Have simpler lowering for exact udivs in both SelectionDAG and GlobalISel.
According to Hacker's Delight, the formula is the same for unsigned exact division as for signed exact division, except that the initial shift is logical rather than arithmetic.
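The inverse computation in the patch is the standard Newton-Raphson iteration modulo 2^n: seeding with the odd divisor d itself is already correct to three bits (d*d ≡ 1 mod 8 for any odd d), and each Factor *= 2 - Divisor*Factor step squares the error term, doubling the number of correct low bits, so 32-bit values converge in at most four iterations. A minimal standalone C++ sketch of the same loop on uint32_t (illustration only; the patch's APInt version above is the authoritative one, and the helper name here is made up):

```cpp
#include <cassert>
#include <cstdint>

// Multiplicative inverse of an odd value modulo 2^32, using the same
// Newton's-method loop as the patch. Each iteration doubles the number
// of correct low bits, starting from three (d * d == 1 mod 8).
uint32_t inverseMod2_32(uint32_t d) {
  assert((d & 1) && "only odd values are invertible mod 2^32");
  uint32_t f = d, t;
  while ((t = d * f) != 1)
    f *= 2 - t;
  return f;
}

int main() {
  // Reproduces the magic numbers in the tests: the inverse of 25, and
  // the inverse of 3 (the odd part of 24).
  assert(inverseMod2_32(25) == 0xC28F5C29u);
  assert(inverseMod2_32(3) == 0xAAAAAAABu);
  return 0;
}
```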