Skip to content

Commit

Permalink
[LV] Increase max VF if vectorized function variants exist (#66639)
Browse files Browse the repository at this point in the history
If there are function calls in the candidate loop and we have vectorized
variants available, try some wider VFs in case the conservative initial
maximum based on the widest types in the loop won't actually allow us
to make use of those function variants.
  • Loading branch information
huntergr-arm committed Nov 13, 2023
1 parent f6f625f commit b070629
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 2 deletions.
10 changes: 10 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ class LoopVectorizationLegality {
return MaskedOp.contains(I);
}

/// Query whether the candidate loop contains at least one call for which a
/// vectorized variant mapping was found during legality analysis; used by
/// LoopVectorize when deciding whether to try wider VFs.
bool hasVectorCallVariants() const {
  return VecCallVariantsFound;
}

unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

Expand Down Expand Up @@ -538,6 +542,12 @@ class LoopVectorizationLegality {
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;

/// If we discover function calls within the loop which have a valid
/// vectorized variant, record that fact so that LoopVectorize can
/// (potentially) make a better decision on the maximum VF and enable
/// the use of those function variants.
bool VecCallVariantsFound = false;
};

} // namespace llvm
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}

// If we found a vectorized variant of a function, note that so LV can
// make better decisions about maximum VF.
if (CI && !VFDatabase::getMappings(*CI).empty())
VecCallVariantsFound = true;

// Check that the instruction return type is vectorizable.
// Also, we can't vectorize extractelement instructions.
if ((!VectorType::isValidElementType(I.getType()) &&
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,11 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));

// Allow the cost model to consider wider vectorization factors when the loop
// contains calls that have vectorized variants available (on by default;
// hidden tuning knob).
static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"),
    cl::init(true));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
Expand Down Expand Up @@ -5095,8 +5100,10 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
if (MaximizeBandwidth ||
(MaximizeBandwidth.getNumOccurrences() == 0 &&
(TTI.shouldMaximizeVectorBandwidth(RegKind) ||
(UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
Expand Down
115 changes: 115 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW

target triple = "aarch64-unknown-linux-gnu"

define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
; WIDE-LABEL: @test_widen(
; WIDE-NEXT: entry:
; WIDE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; WIDE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
; WIDE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; WIDE: vector.ph:
; WIDE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; WIDE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
; WIDE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; WIDE-NEXT: br label [[VECTOR_BODY:%.*]]
; WIDE: vector.body:
; WIDE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; WIDE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
; WIDE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP4]], align 8
; WIDE-NEXT: [[TMP5:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float>
; WIDE-NEXT: [[TMP6:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; WIDE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
; WIDE-NEXT: store <vscale x 4 x float> [[TMP6]], ptr [[TMP7]], align 4
; WIDE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; WIDE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; WIDE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; WIDE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; WIDE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; WIDE: middle.block:
; WIDE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; WIDE: scalar.ph:
; WIDE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; WIDE-NEXT: br label [[FOR_BODY:%.*]]
; WIDE: for.body:
; WIDE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; WIDE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
; WIDE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
; WIDE-NEXT: [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
; WIDE-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]]
; WIDE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; WIDE-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4
; WIDE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; WIDE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; WIDE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; WIDE: for.cond.cleanup:
; WIDE-NEXT: ret void
;
; NARROW-LABEL: @test_widen(
; NARROW-NEXT: entry:
; NARROW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NARROW: vector.ph:
; NARROW-NEXT: br label [[VECTOR_BODY:%.*]]
; NARROW: vector.body:
; NARROW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NARROW-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; NARROW-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
; NARROW-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
; NARROW-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; NARROW-NEXT: [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
; NARROW-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
; NARROW-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
; NARROW-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
; NARROW-NEXT: store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
; NARROW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; NARROW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; NARROW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NARROW: middle.block:
; NARROW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; NARROW: scalar.ph:
; NARROW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; NARROW-NEXT: br label [[FOR_BODY:%.*]]
; NARROW: for.body:
; NARROW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; NARROW-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
; NARROW-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
; NARROW-NEXT: [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
; NARROW-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
; NARROW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; NARROW-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4
; NARROW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NARROW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; NARROW: for.cond.cleanup:
; NARROW-NEXT: ret void
;
; Input loop: loads double, truncates to float, calls @foo (which has a
; vectorized variant per attribute #0), stores the float result. The widest
; type in the loop is double, so the conservative max VF is based on
; 2 x double. The WIDE run checks that LV nevertheless picks <vscale x 4>
; and calls @foo_vector; the NARROW run (flag disabled) keeps VF=2 and
; scalarizes the call with extract/insertelement instead.
entry:
br label %for.body

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep = getelementptr double, ptr %b, i64 %indvars.iv
%load = load double, ptr %gep
%trunc = fptrunc double %load to float
%call = call float @foo(float %trunc) #0
%arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %call, ptr %arrayidx
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1025
br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
ret void
}

; Scalar callee and its scalable vector counterpart; the extra
; <vscale x 4 x i1> parameter on @foo_vector is the mask.
declare float @foo(float)
declare <vscale x 4 x float> @foo_vector(<vscale x 4 x float>, <vscale x 4 x i1>)

; #0 maps @foo to @foo_vector via the vector-function-abi-variant attribute;
; per the VFABI mangling, "_ZGV_LLVM_Mxv" encodes a masked (M),
; scalable-width (x) variant with one vector parameter (v).
attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
; #1 enables SVE with a known vscale range so scalable VFs are legal here.
attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }

0 comments on commit b070629

Please sign in to comment.