Skip to content

Commit

Permalink
[LV] Increase max VF if vectorized function variants exist (#66639)
Browse files Browse the repository at this point in the history
If there are function calls in the candidate loop and we have vectorized
variants available, try some wider VFs in case the conservative initial
maximum based on the widest types in the loop won't actually allow us
to make use of those function variants.
  • Loading branch information
huntergr-arm committed Nov 13, 2023
1 parent f6f625f commit b070629
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 2 deletions.
10 changes: 10 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ class LoopVectorizationLegality {
return MaskedOp.contains(I);
}

/// Query whether the candidate loop contains at least one call for which a
/// vectorized variant mapping was found during legality analysis; used by
/// LoopVectorize when deciding whether to try wider VFs.
bool hasVectorCallVariants() const {
  return VecCallVariantsFound;
}

unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

Expand Down Expand Up @@ -538,6 +542,12 @@ class LoopVectorizationLegality {
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;

/// If we discover function calls within the loop which have a valid
/// vectorized variant, record that fact so that LoopVectorize can
/// (potentially) make a better decision on the maximum VF and enable
/// the use of those function variants.
bool VecCallVariantsFound = false;
};

} // namespace llvm
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}

// If we found a vectorized variant of a function, note that so LV can
// make better decisions about maximum VF.
if (CI && !VFDatabase::getMappings(*CI).empty())
VecCallVariantsFound = true;

// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(I.getType()) &&
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,11 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));

// Allow the cost model to consider wider vectorization factors when the loop
// contains calls that have vectorized variants available (on by default;
// hidden tuning knob).
static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"),
    cl::init(true));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
Expand Down Expand Up @@ -5095,8 +5100,10 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
if (MaximizeBandwidth ||
(MaximizeBandwidth.getNumOccurrences() == 0 &&
(TTI.shouldMaximizeVectorBandwidth(RegKind) ||
(UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
Expand Down
115 changes: 115 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW

target triple = "aarch64-unknown-linux-gnu"

define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
; WIDE-LABEL: @test_widen(
; WIDE-NEXT: entry:
; WIDE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; WIDE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
; WIDE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; WIDE: vector.ph:
; WIDE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; WIDE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
; WIDE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; WIDE-NEXT: br label [[VECTOR_BODY:%.*]]
; WIDE: vector.body:
; WIDE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; WIDE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
; WIDE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP4]], align 8
; WIDE-NEXT: [[TMP5:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float>
; WIDE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; WIDE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
; WIDE-NEXT: store <vscale x 4 x float> [[TMP6]], ptr [[TMP7]], align 4
; WIDE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; WIDE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; WIDE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; WIDE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; WIDE: middle.block:
; WIDE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; WIDE: scalar.ph:
; WIDE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; WIDE-NEXT: br label [[FOR_BODY:%.*]]
; WIDE: for.body:
; WIDE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; WIDE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
; WIDE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
; WIDE-NEXT: [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
; WIDE-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]]
; WIDE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; WIDE-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4
; WIDE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; WIDE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; WIDE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; WIDE: for.cond.cleanup:
; WIDE-NEXT: ret void
;
; NARROW-LABEL: @test_widen(
; NARROW-NEXT: entry:
; NARROW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NARROW: vector.ph:
; NARROW-NEXT: br label [[VECTOR_BODY:%.*]]
; NARROW: vector.body:
; NARROW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NARROW-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; NARROW-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
; NARROW-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
; NARROW-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; NARROW-NEXT: [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
; NARROW-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
; NARROW-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
; NARROW-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
; NARROW-NEXT: store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
; NARROW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; NARROW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; NARROW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NARROW: middle.block:
; NARROW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; NARROW: scalar.ph:
; NARROW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; NARROW-NEXT: br label [[FOR_BODY:%.*]]
; NARROW: for.body:
; NARROW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; NARROW-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
; NARROW-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
; NARROW-NEXT: [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
; NARROW-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
; NARROW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; NARROW-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4
; NARROW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NARROW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; NARROW: for.cond.cleanup:
; NARROW-NEXT: ret void
;
; Input loop: loads double, truncates to float, calls @foo (which has a
; vectorized variant per attribute #0), stores the float result. The widest
; type in the loop is double, so the conservative max VF is based on
; 2 x double. The WIDE run checks that LV nevertheless picks <vscale x 4>
; and calls @foo_vector; the NARROW run (flag disabled) keeps VF=2 and
; scalarizes the call with extract/insertelement instead.
entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep = getelementptr double, ptr %b, i64 %indvars.iv
%load = load double, ptr %gep
%trunc = fptrunc double %load to float
%call = call float @foo(float %trunc) #0
%arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %call, ptr %arrayidx
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1025
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
ret void
}

; Scalar callee and its scalable vector counterpart; the extra
; <vscale x 4 x i1> parameter on @foo_vector is the mask.
declare float @foo(float)
declare <vscale x 4 x float> @foo_vector(<vscale x 4 x float>, <vscale x 4 x i1>)

; #0 maps @foo to @foo_vector via the vector-function-abi-variant attribute;
; per the VFABI mangling, "_ZGV_LLVM_Mxv" encodes a masked (M),
; scalable-width (x) variant with one vector parameter (v).
attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
; #1 enables SVE with a known vscale range so scalable VFs are legal here.
attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }

0 comments on commit b070629

Please sign in to comment.