diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90..a509ebf6a7e1b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -380,6 +380,10 @@ class LoopVectorizationLegality {
     return MaskedOp.contains(I);
   }
 
+  /// Returns true if there is at least one function call in the loop which
+  /// has a vectorized variant available.
+  bool hasVectorCallVariants() const { return VecCallVariantsFound; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -538,6 +542,12 @@ class LoopVectorizationLegality {
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  /// If we discover function calls within the loop which have a valid
+  /// vectorized variant, record that fact so that LoopVectorize can
+  /// (potentially) make a better decision on the maximum VF and enable
+  /// the use of those function variants.
+  bool VecCallVariantsFound = false;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ca3c8d524d1b5..37a356c43e29a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         }
       }
 
+      // If we found a vectorized variant of a function, note that so LV can
+      // make better decisions about maximum VF.
+      if (CI && !VFDatabase::getMappings(*CI).empty())
+        VecCallVariantsFound = true;
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(I.getType()) &&
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 65f0662320de8..e9d0315d114f6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,6 +391,11 @@ static cl::opt<bool> ForceSafeDivisor(
     cl::desc(
         "Override cost based safe divisor widening for div/rem instructions"));
 
+static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
+    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
+    cl::Hidden,
+    cl::desc("Try wider VFs if they enable the use of vector variants"));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -5095,8 +5100,10 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
-  if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
-                            TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+  if (MaximizeBandwidth ||
+      (MaximizeBandwidth.getNumOccurrences() == 0 &&
+       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
new file mode 100644
index 0000000000000..00e8881426fd8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
+; WIDE-LABEL: @test_widen(
+; WIDE-NEXT:  entry:
+; WIDE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; WIDE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; WIDE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; WIDE:       vector.ph:
+; WIDE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; WIDE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; WIDE:       vector.body:
+; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP4]], align 8
+; WIDE-NEXT:    [[TMP5:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float>
+; WIDE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    store <vscale x 4 x float> [[TMP6]], ptr [[TMP7]], align 4
+; WIDE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; WIDE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE:       middle.block:
+; WIDE-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; WIDE:       scalar.ph:
+; WIDE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; WIDE-NEXT:    br label [[FOR_BODY:%.*]]
+; WIDE:       for.body:
+; WIDE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; WIDE-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; WIDE-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; WIDE-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]]
+; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; WIDE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; WIDE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; WIDE:       for.cond.cleanup:
+; WIDE-NEXT:    ret void
+;
+; NARROW-LABEL: @test_widen(
+; NARROW-NEXT:  entry:
+; NARROW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NARROW:       vector.ph:
+; NARROW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NARROW:       vector.body:
+; NARROW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NARROW-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
+; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
+; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
+; NARROW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
+; NARROW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; NARROW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; NARROW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NARROW:       middle.block:
+; NARROW-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NARROW:       scalar.ph:
+; NARROW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
+; NARROW:       for.body:
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; NARROW-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NARROW:       for.cond.cleanup:
+; NARROW-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gep = getelementptr double, ptr %b, i64 %indvars.iv
+  %load = load double, ptr %gep
+  %trunc = fptrunc double %load to float
+  %call = call float @foo(float %trunc) #0
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %call, ptr %arrayidx
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1025
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare float @foo(float)
+declare <vscale x 4 x float> @foo_vector(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }