[SVE][LoopVectorize] Support for vectorization of loops with function…

… calls Changes `getScalarizationOverhead` to return an invalid cost for scalable VFs and adds some simple tests for loops containing a function for which there is a vectorized variant available. Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D96356
llvm · Feb 12, 2021 · fea06ef · fea06ef
1 parent 79b1b4a
commit fea06ef
Show file tree

Hide file tree

Showing 3 changed files with 115 additions and 7 deletions.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -82,7 +82,6 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
                                                  ElementCount Factor)
     : RetTy(CI.getType()), IID(Id), VF(Factor) {
 
-  assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
   if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
     FMF = FPMO->getFastMathFlags();
 

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3753,7 +3753,6 @@ static void cse(BasicBlock *BB) {
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                               bool &NeedToScalarize) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   Function *F = CI->getCalledFunction();
   Type *ScalarRetTy = CI->getType();
   SmallVector<Type *, 4> Tys, ScalarTys;
@@ -4967,10 +4966,8 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       Type *TysForDecl[] = {CI->getType()};
-      if (VF.isVector()) {
-        assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+      if (VF.isVector())
         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
-      }
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
@@ -7042,8 +7039,9 @@ InstructionCost
 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                      ElementCount VF) {
 
-  assert(!VF.isScalable() &&
-         "cannot compute scalarization overhead for scalable vectorization");
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+
   if (VF.isScalar())
     return 0;
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -0,0 +1,111 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
+; CHECK-LABEL: @vec_load
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> %[[LOAD]])
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %iv
+  %0 = load double, double* %arrayidx, align 8
+  %1 = call double @foo(double %0) #0
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %iv
+  store double %add, double* %arrayidx2, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+define void @vec_scalar(i64 %N, double* nocapture %a) {
+; CHECK-LABEL: @vec_scalar
+; CHECK: vector.body:
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %0 = call double @foo(double 10.0) #0
+  %sub = fsub double %0, 1.000000e+00
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  store double %sub, double* %arrayidx, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:                                 ; preds = %for.body, %entry
+  ret void
+}
+
+define void @vec_ptr(i64 %N, i64* noalias %a, i64** readnone %b) {
+; CHECK-LABEL: @vec_ptr
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>*
+; CHECK: call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> %[[LOAD]])
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep = getelementptr i64*, i64** %b, i64 %iv
+  %load = load i64*, i64** %gep
+  %call = call i64 @bar(i64* %load) #1
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
+  store i64 %call, i64* %arrayidx
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
+; CHECK-LABEL: @vec_intrinsic
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+entry:
+  %cmp7 = icmp sgt i64 %N, 0
+  br i1 %cmp7, label %for.body, label %for.end
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 8
+  %1 = call fast double @llvm.sin.f64(double %0) #2
+  %add = fadd fast double %1, 1.000000e+00
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+declare double @foo(double)
+declare i64 @bar(i64*)
+declare double @llvm.sin.f64(double)
+
+declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
+declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
+declare <vscale x 2 x double> @sin_vec(<vscale x 2 x double>)
+
+attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
+attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
+attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" }
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}