Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SLP]Improve final minbitwidth analysis attempt. #87786

Conversation

alexey-bataev
Copy link
Member

Added demanded-bits analysis to IsPotentiallyTruncated to improve the
final minbitwidth analysis attempt.

Metric: size..text

Program size..text
results results0 diff
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 43069.00 42973.00 -0.2%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 43066.00 42970.00 -0.2%

Extra trunc instructions are emitted to operate on <32 x i8> instead
of <32 x i16>; they will be removed in follow-up patches.

Created using spr 1.3.5
@llvmbot
Copy link
Collaborator

llvmbot commented Apr 5, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Added demanded-bits analysis to IsPotentiallyTruncated to improve the
final minbitwidth analysis attempt.

Metric: size..text

Program size..text
results results0 diff
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test 43069.00 42973.00 -0.2%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test 43066.00 42970.00 -0.2%

Extra trunc instructions are emitted to operate on <32 x i8> instead
of <32 x i16>; they will be removed in follow-up patches.


Full diff: https://github.com/llvm/llvm-project/pull/87786.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+5)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll (+3-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll (+5-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll (+4-3)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bdd26acfd2f8b4..41e5188f28602f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14123,6 +14123,11 @@ bool BoUpSLP::collectValuesToDemote(
     unsigned BitWidth1 = OrigBitWidth - NumSignBits;
     if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
       ++BitWidth1;
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      APInt Mask = DB->getDemandedBits(I);
+      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+      BitWidth1 = std::min(BitWidth1, BitWidth2);
+    }
     BitWidth = std::max(BitWidth, BitWidth1);
     return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
   };
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 44542f32bf145d..d2711d0546c0ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -81,8 +81,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT:    [[TMP57:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], [[TMP57]]
 ; CHECK-NEXT:    [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127
 ; CHECK-NEXT:    [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31
 ; CHECK-NEXT:    br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
index 2d69c7c984dcd2..04d275742832ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/trunc-to-large-than-bw.ll
@@ -8,10 +8,11 @@ define i32 @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[TMP1]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP5]], i32 1)
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
index f4a471493f1b3f..55da3e5f9f37c6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
@@ -7,9 +7,10 @@ define void @t(i64 %v) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], <i32 5, i32 6, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], <i16 5, i16 6, i16 3, i16 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 65535
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr null, align 4
 ; CHECK-NEXT:    ret void

@alexey-bataev
Copy link
Member Author

Ping!

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@alexey-bataev alexey-bataev merged commit 01d9528 into main Apr 8, 2024
7 checks passed
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpimprove-final-minbitwidth-analysis-attempt branch April 8, 2024 19:54
@aeubanks
Copy link
Contributor

We have a test that started failing after this patch. I have narrowed it down to the following difference in the SLPVectorizer output without (good) and with (bad) this patch:

$ diff /tmp/good.ll /tmp/bad.ll 
992,1011c992,1009
<   %8 = zext <8 x i16> %0 to <8 x i32>
<   %9 = zext <8 x i16> %1 to <8 x i32>
<   %10 = sub nsw <8 x i32> %9, %8
<   %11 = add nsw <8 x i32> %10, <i32 3329, i32 3329, i32 3329, i32 3329, i32 3329, i32 3329, i32 3329, i32 3329>
<   %12 = insertelement <8 x i32> poison, i32 %zext795, i32 0
<   %13 = shufflevector <8 x i32> %12, <8 x i32> poison, <8 x i32> zeroinitializer
<   %14 = mul <8 x i32> %11, %13
<   %15 = zext <8 x i32> %14 to <8 x i64>
<   %16 = mul nuw nsw <8 x i64> %15, <i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039>
<   %17 = lshr <8 x i64> %16, <i64 24, i64 24, i64 24, i64 24, i64 24, i64 24, i64 24, i64 24>
<   %18 = trunc <8 x i64> %17 to <8 x i32>
<   %19 = mul <8 x i32> %18, <i32 62207, i32 62207, i32 62207, i32 62207, i32 62207, i32 62207, i32 62207, i32 62207>
<   %20 = add <8 x i32> %19, %14
<   %21 = trunc <8 x i32> %20 to <8 x i16>
<   %22 = add <8 x i16> %21, <i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329>
<   %23 = icmp slt <8 x i16> %22, zeroinitializer
<   %24 = select <8 x i1> %23, <8 x i16> %21, <8 x i16> zeroinitializer
<   %25 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %22, <8 x i16> zeroinitializer)
<   %26 = or <8 x i16> %24, %25
<   store <8 x i16> %26, ptr %getelementptr797, align 2
---
>   %8 = sub <8 x i16> %1, %0
>   %9 = add <8 x i16> %8, <i16 3329, i16 3329, i16 3329, i16 3329, i16 3329, i16 3329, i16 3329, i16 3329>
>   %10 = insertelement <8 x i32> poison, i32 %zext795, i32 0
>   %11 = shufflevector <8 x i32> %10, <8 x i32> poison, <8 x i32> zeroinitializer
>   %12 = trunc <8 x i32> %11 to <8 x i16>
>   %13 = mul <8 x i16> %9, %12
>   %14 = zext <8 x i16> %13 to <8 x i64>
>   %15 = mul nuw nsw <8 x i64> %14, <i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039, i64 5039>
>   %16 = lshr <8 x i64> %15, <i64 24, i64 24, i64 24, i64 24, i64 24, i64 24, i64 24, i64 24>
>   %17 = trunc <8 x i64> %16 to <8 x i16>
>   %18 = mul <8 x i16> %17, <i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329>
>   %19 = add <8 x i16> %18, %13
>   %20 = add <8 x i16> %19, <i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329, i16 -3329>
>   %21 = icmp slt <8 x i16> %20, zeroinitializer
>   %22 = select <8 x i1> %21, <8 x i16> %19, <8 x i16> zeroinitializer
>   %23 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %20, <8 x i16> zeroinitializer)
>   %24 = or <8 x i16> %22, %23
>   store <8 x i16> %24, ptr %getelementptr797, align 2

but I'm trying to understand what's going wrong

@alexey-bataev
Copy link
Member Author

It would be good to get a reproducer for the investigation.

@aeubanks
Copy link
Contributor

c.ll.txt
is the full pre-SLP-vectorizer IR

@alexey-bataev
Copy link
Member Author

Thanks, I have an idea of what went wrong; I will check tomorrow.

@alexey-bataev
Copy link
Member Author

c.ll.txt is the full pre-SLP-vectorizer IR

Should be fixed in 74e07ab

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

4 participants