diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5535cc55e9321..2057cab46135f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9722,7 +9722,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   }
   // The scalar cost should only be 0 when vectorizing with a user specified
   // VF/IC. In those cases, runtime checks should always be generated.
-  double ScalarC = *VF.ScalarCost.getValue();
+  uint64_t ScalarC = *VF.ScalarCost.getValue();
   if (ScalarC == 0)
     return true;
 
@@ -9749,7 +9749,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   //  RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
   //
   // Now we can compute the minimum required trip count TC as
-  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
   //
   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
   // the computations are performed on doubles, not integers and the result
@@ -9761,9 +9761,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
       AssumedMinimumVscale = *VScale;
     IntVF *= AssumedMinimumVscale;
   }
-  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
-  double RtC = *CheckCost.getValue();
-  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+  uint64_t RtC = *CheckCost.getValue();
+  uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
+  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
 
   // Second, compute a minimum iteration count so that the cost of the
   // runtime checks is only a fraction of the total scalar loop cost. This
@@ -9772,12 +9772,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   //  * TC. To bound the runtime check to be a fraction 1/X of the scalar
   //    cost, compute
   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
-  double MinTC2 = RtC * 10 / ScalarC;
+  uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
 
   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
   // epilogue is allowed, choose the next closest multiple of VF. This should
   // partly compensate for ignoring the epilogue cost.
-  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+  uint64_t MinTC = std::max(MinTC1, MinTC2);
   if (SEL == CM_ScalarEpilogueAllowed)
     MinTC = alignTo(MinTC, IntVF);
   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
index a90b38c6a9605..fe98ca167a089 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -9,6 +9,11 @@
 ; RUN:   -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
 ; RUN:   -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
 
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP-DEF %s
+
 ; The target does not support predicated vectorization.
 define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-LABEL: @foo(
@@ -80,6 +85,54 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; NO-VP:       for.cond.cleanup:
 ; NO-VP-NEXT:    ret void
 ;
+; NO-VP-DEF-LABEL: @foo(
+; NO-VP-DEF-NEXT:  entry:
+; NO-VP-DEF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP0]]
+; NO-VP-DEF-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-DEF:       vector.ph:
+; NO-VP-DEF-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP1]]
+; NO-VP-DEF-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-DEF-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO-VP-DEF:       vector.body:
+; NO-VP-DEF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-DEF-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-DEF-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-DEF-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-VP-DEF-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-DEF-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 1 x i32>, ptr [[TMP7]], align 4
+; NO-VP-DEF-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-DEF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-DEF-NEXT:    store <vscale x 1 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; NO-VP-DEF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-VP-DEF-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-DEF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-DEF:       middle.block:
+; NO-VP-DEF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-DEF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP-DEF:       scalar.ph:
+; NO-VP-DEF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-DEF-NEXT:    br label [[FOR_BODY:%.*]]
+; NO-VP-DEF:       for.body:
+; NO-VP-DEF-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-DEF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-DEF-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-DEF-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-DEF-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-DEF-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-DEF-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-DEF-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-DEF-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-DEF-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-DEF-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-DEF:       for.cond.cleanup:
+; NO-VP-DEF-NEXT:    ret void
+;
 entry:
   br label %for.body
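For readers following the cost-model change above, the standalone sketch below replays the new integer-only minimum-trip-count arithmetic outside of LLVM. The cost values (ScalarC, VecC, RtC) and the vectorization factor are invented example numbers, and divideCeil/alignTo are re-implemented locally as stand-ins for the helpers in llvm/Support/MathExtras.h; it only illustrates how the MinProfitableTripCount bound from the patch is derived, not the exact surrounding LoopVectorize.cpp code.

```cpp
// Standalone sketch of the minimum-trip-count computation introduced by the
// patch. Costs below are invented example values, not real cost-model output.
#include <algorithm>
#include <cstdint>
#include <iostream>

// Local stand-ins for llvm::divideCeil / llvm::alignTo (MathExtras.h).
static uint64_t divideCeil(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return divideCeil(Value, Align) * Align;
}

int main() {
  // Hypothetical costs and VF.
  uint64_t ScalarC = 4;  // cost of one scalar iteration
  uint64_t VecC = 10;    // cost of one vector iteration
  uint64_t RtC = 35;     // cost of the generated runtime checks
  uint64_t IntVF = 8;    // fixed (or vscale-adjusted) vectorization factor

  // MinTC1: trip count beyond which the vector loop plus runtime checks beats
  // the scalar loop, from VF * RtC / (ScalarC * VF - VecC) < TC (EpiC = 0).
  uint64_t Div = ScalarC * IntVF - VecC;
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);

  // MinTC2: trip count beyond which the runtime checks are at most 1/10 of
  // the total scalar cost, from RtC * 10 / ScalarC < TC.
  uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);

  // Take the larger bound and round it up to a multiple of VF, as the patch
  // does when a scalar epilogue is allowed.
  uint64_t MinTC = alignTo(std::max(MinTC1, MinTC2), IntVF);
  std::cout << "MinTC1=" << MinTC1 << " MinTC2=" << MinTC2
            << " MinProfitableTripCount=" << MinTC << "\n";
  return 0;
}
```

Using divideCeil on unsigned 64-bit values keeps the rounding-up behavior of the old floating-point code (an upper estimate of the required trip count) while avoiding double arithmetic entirely.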