-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Increase max VF if vectorized function variants exist #66639
Conversation
If there are function calls in the candidate loop and we have vectorized variants available, try some wider VFs in case the conservative initial maximum based on the widest types in the loop won't actually allow us to make use of those function variants.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Changes: If there are function calls in the candidate loop and we have vectorized variants available, try some wider VFs in case the conservative initial maximum based on the widest types in the loop won't actually allow us to make use of those function variants.

Full diff: https://github.com/llvm/llvm-project/pull/66639.diff — 4 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..a97cc014a3dae45 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -380,6 +380,10 @@ class LoopVectorizationLegality {
return MaskedOp.contains(I);
}
+ /// Returns true if there is at least one function call in the loop which
+ /// has a vectorized variant available.
+ bool hasVectorVariants() const { return VecVariantsFound; }
+
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
@@ -538,6 +542,12 @@ class LoopVectorizationLegality {
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
+
+ /// If we discover function calls within the loop which have a valid
+ /// vectorized variant, record that fact so that LoopVectorize can
+ /// (potentially) make a better decision on the maximum VF and enable
+ /// the use of those function variants.
+ bool VecVariantsFound = false;
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..d8d44b4cf96b601 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));
+static cl::opt<bool> UseWiderVFIfVariantsPresent(
+ "vectorizer-maximize-bandwidth-if-variant-present", cl::init(true),
+ cl::Hidden,
+ cl::desc("Try wider VFs if they enable the use of vector variants"));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -943,6 +948,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
+ // If we found a vectorized variant of a function, note that so LV can
+ // make better decisions about maximum VF.
+ if (CI && !VFDatabase::getMappings(*CI).empty() &&
+ UseWiderVFIfVariantsPresent)
+ VecVariantsFound = true;
+
// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(I.getType()) &&
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..e8842fcf56da49a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5152,7 +5152,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
- TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+ (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+ Legal->hasVectorVariants()))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
new file mode 100644
index 000000000000000..1d40a7c5fbe96b9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-if-variant-present=false -S | FileCheck %s --check-prefixes=NARROW
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
+; WIDE-LABEL: @test_widen(
+; WIDE-NEXT: entry:
+; WIDE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; WIDE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; WIDE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; WIDE: vector.ph:
+; WIDE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; WIDE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; WIDE-NEXT: br label [[VECTOR_BODY:%.*]]
+; WIDE: vector.body:
+; WIDE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; WIDE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[TMP4]], align 8
+; WIDE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; WIDE-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT: store <vscale x 4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; WIDE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; WIDE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; WIDE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE: middle.block:
+; WIDE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; WIDE: scalar.ph:
+; WIDE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; WIDE-NEXT: br label [[FOR_BODY:%.*]]
+; WIDE: for.body:
+; WIDE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; WIDE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT: [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; WIDE-NEXT: [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; WIDE-NEXT: [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR3:[0-9]+]]
+; WIDE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; WIDE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; WIDE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; WIDE: for.cond.cleanup:
+; WIDE-NEXT: ret void
+;
+; NARROW-LABEL: @test_widen(
+; NARROW-NEXT: entry:
+; NARROW-NEXT: br label [[FOR_BODY:%.*]]
+; NARROW: for.body:
+; NARROW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT: [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; NARROW-NEXT: [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; NARROW-NEXT: [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT: store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NARROW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; NARROW: for.cond.cleanup:
+; NARROW-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gep = getelementptr i64, ptr %b, i64 %indvars.iv
+ %load = load ptr, ptr %gep
+ %load2 = load i32, ptr %load
+ %call = call i32 @foo(i32 %load2) #0
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %call, ptr %arrayidx
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1025
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+declare i32 @foo(i32)
+declare <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }
|
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Outdated
Show resolved
Hide resolved
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
Outdated
Show resolved
Hide resolved
@@ -390,6 +390,11 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
     cl::desc(
         "Override cost based safe divisor widening for div/rem instructions"));

+static cl::opt<bool> UseWiderVFIfVariantsPresent(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry if I missed this before, but can you also rename this to something a bit more specific, i.e. UseWiderVFIfCallVariantsPresent? At the moment it could refer to variants of anything. Other than that the patch looks good!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! Thanks @huntergr-arm.
If there are function calls in the candidate loop and we have vectorized variants available, try some wider VFs in case the conservative initial maximum based on the widest types in the loop won't actually allow us to make use of those function variants.
If there are function calls in the candidate loop and we have vectorized
variants available, try some wider VFs in case the conservative initial
maximum based on the widest types in the loop won't actually allow us
to make use of those function variants.