-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix non-splat vector SREM expansion when one of the divisors is a power of two. #82706
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag Author: Owen Anderson (resistor) ChangesThe expansion previously used, derived from Hacker's Delight, Fixes #77169 Full diff: https://github.com/llvm/llvm-project/pull/82706.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 07fb89127a737c..21d484b1962524 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6803,6 +6803,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
SDValue CompTargetNode, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const {
+ // Derived from Hacker's Delight, 2nd Edition, by Hank Warren. Section 10-17.
// Fold:
// (seteq/ne (srem N, D), 0)
// To:
@@ -6813,6 +6814,17 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^k)))
// - Q = floor((2 * A) / (2^K))
// where W is the width of the common type of N and D.
+ //
+ // When D is a power of two (and thus D0 is 1), the normal
+ // formula for A and Q don't apply, because the derivation
+ // depends on D not dividing 2^(W-1), and thus theoroem ZRS
+ // does not apply. This specifically fails when N = INT_MIN.
+ //
+ // Instead, for power-of-two D, we use:
+ // - A = 2^(W-1)
+ // |-> Order-preserving map from [-2^(W-1), 2^(W-1) - 1] to [0,2^W - 1])
+ // - Q = 2^(W-K) - 1
+ // |-> Test that the top K bits are zero after rotation
assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
"Only applicable for (in)equality comparisons.");
@@ -6896,6 +6908,14 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// Q = floor((2 * A) / (2^K))
APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));
+ // If D was a power of two, apply the alternate constant derivation.
+ if (D0.isOne()) {
+ // A = 2^(W-1)
+ A = APInt::getSignedMinValue(W);
+ // - Q = 2^(W-K) - 1
+ Q = APInt::getAllOnes(W - K).zext(W);
+ }
+
assert(APInt::getAllOnes(SVT.getSizeInBits()).ugt(A) &&
"We are expecting that A is always less than all-ones for SVT");
assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 8509e930ba74ae..3dde5c1c8a40c1 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -560,7 +560,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435455,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -646,7 +646,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -735,7 +735,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435455,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1041,7 +1041,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,0,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,1,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
@@ -1135,21 +1135,21 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783]
; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-SSE41-NEXT: por %xmm5, %xmm4
-; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3
-; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: por %xmm4, %xmm3
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
@@ -1157,17 +1157,16 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -1179,12 +1178,11 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4
+; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3
+; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
@@ -1196,15 +1194,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378]
-; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vpminud %xmm3, %xmm2, %xmm3
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: retq
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>
@@ -1263,7 +1260,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; CHECK-SSE41-NEXT: por %xmm4, %xmm3
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672]
; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2
; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1362,7 +1359,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm1
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1447,7 +1444,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435455,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1536,7 +1533,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1844,7 +1841,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,858993458]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -1932,7 +1929,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2016,7 +2013,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: por %xmm2, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,42949672]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2093,7 +2090,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2166,7 +2163,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: psrlq $32, %xmm0
; CHECK-SSE41-NEXT: por %xmm1, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435455,4294967295]
; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -2237,7 +2234,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
@@ -2264,7 +2261,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: psrlw $8, %xmm0
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1
; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3
@@ -2300,7 +2297,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; CHECK-SSE41-NEXT: psrlw $8, %xmm6
; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0
; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7
@@ -2326,7 +2323,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: psrlw $8, %xmm0
; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4
; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3
|
I should add that I have exhaustively tested the new derivation at W=32 for all values of N and all D=2^K. |
a power of two. The expansion previously used, derived from Hacker's Delight, does not work correctly when the dividend is INT_MIN and the divisor is a power of two. We now use an alternate derivation of the A and Q constants specifically for the power-of-two divisor case to avoid this problem. Credit to Fabian Giesen for the new derivation. I have exhaustively tested the new derivation at W=32 for all values of N and all D=2^K. Fixes llvm#77169
of non-splat vector SREM expansion when we aren't hitting the special case. Fixes llvm/llvm-project#84830 Introduced in llvm/llvm-project#82706
The expansion previously used, derived from Hacker's Delight,
does not work correctly when the dividend is INT_MIN and the
divisor is a power of two. We now use an alternate derivation
of the A and Q constants specifically for the power-of-two divisor
case to avoid this problem. Credit to Fabian Giesen for the
new derivation.
Fixes #77169