-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[SLP]Do not vectorize small (<=2) buildvector/buildvalue sequences with MaxVF==true. #98957
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesIf MaxVFOnly for buildvector/buildvalue vectorization is set to true and the Full diff: https://github.com/llvm/llvm-project/pull/98957.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 74a16d3fbcad6..629ddeea1534a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18254,6 +18254,14 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
return false;
+ if (MaxVFOnly && BuildVectorOpds.size() == 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
+ << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
+ "trying reduction first.";
+ });
+ return false;
+ }
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register.
return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
@@ -18270,6 +18278,14 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
isFixedVectorShuffle(BuildVectorOpds, Mask)))
return false;
+ if (MaxVFOnly && BuildVectorInsts.size() == 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
+ << "Cannot SLP vectorize list: only 2 elements of buildvector, "
+ "trying reduction first.";
+ });
+ return false;
+ }
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index a24cb81541d7c..c6209fd71063a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -10,33 +10,32 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
; CHECK: bb18:
-; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
-; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP10]], 2.000000e+00
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
-; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP11]], 3.000000e+00
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
+; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
; CHECK: bb25:
-; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
; CHECK-NEXT: br label [[BB30:%.*]]
; CHECK: bb30:
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP13]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = uitofp <4 x i8> [[TMP14]] to <4 x float>
-; CHECK-NEXT: [[TMP16:%.*]] = fsub fast <4 x float> [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]])
+; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
-; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP18]])
+; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
; CHECK: bb57:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
index bf4903fd19b09..2f5936357d0f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll
@@ -7,8 +7,7 @@
; YAML-NEXT: Name: NotPossible
; YAML-NEXT: Function: g
; YAML-NEXT: Args:
-; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible'
-; YAML-NEXT: - String: ' with available vectorization factors'
+; YAML-NEXT: - String: 'Cannot SLP vectorize list: only 2 elements of buildvector, trying reduction first.'
define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) {
; CHECK-LABEL: @g(
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
If MaxVFOnly for buildvector/buildvalue vectorization is set to true and the
total number of elements to vectorize is <= 2, better to try to
vectorize reductions at first, which may produce larger tree (reductions
have a limit of at least 4 elements to vectorize). Smaller
buildvector/buildvalue sequence will be attempted to vectorize later,
with MaxVFOnly set to false.