Patch summary (this text precedes the first "diff --git" header and is not
part of any hunk):
  * LoopVectorize.cpp: adds calculateEarlyExitCost(), which sums the cost of
    every exit-block predecessor other than the middle block, and renames
    areRuntimeChecksProfitable() to isOutsideLoopWorkProfitable(), which now
    adds that early-exit cost to the runtime-check cost.
  * VPlanRecipes.cpp: adds a cost for VPInstruction::ExtractFirstActive
    (cttz.elts intrinsic cost plus one extractelement).
  * Adds AArch64/early_exit_costs.ll and updates one CHECK line in
    AArch64/simple_early_exit.ll.
NOTE(review): line breaks and leading whitespace were reconstructed from a
whitespace-collapsed copy; if context lines fail to match, apply with
whitespace-insensitive matching and confirm against the target tree.

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 02aa21e33bf4f..9e65636865b2c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10171,19 +10171,56 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF, Loop *L,
-                                       PredicatedScalarEvolution &PSE,
-                                       ScalarEpilogueLowering SEL,
-                                       std::optional<unsigned> VScale) {
-  InstructionCost CheckCost = Checks.getCost();
-  if (!CheckCost.isValid())
+/// For loops with uncountable early exits, find the cost of doing work when
+/// exiting the loop early, such as calculating the final exit values of
+/// variables used outside the loop.
+/// TODO: This is currently overly pessimistic because the loop may not take
+/// the early exit, but better to keep this conservative for now. In future,
+/// it might be possible to relax this by using branch probabilities.
+static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
+                                              VPlan &Plan, ElementCount VF) {
+  InstructionCost Cost = 0;
+  for (auto *ExitVPBB : Plan.getExitBlocks()) {
+    for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
+      // If the predecessor is not the middle.block, then it must be the
+      // vector.early.exit block, which may contain work to calculate the exit
+      // values of variables used outside the loop.
+      if (PredVPBB != Plan.getMiddleBlock()) {
+        LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
+                          << PredVPBB->getName() << ":\n");
+        Cost += PredVPBB->cost(VF, CostCtx);
+      }
+    }
+  }
+  return Cost;
+}
+
+/// This function determines whether or not it's still profitable to vectorize
+/// the loop given the extra work we have to do outside of the loop:
+///  1. Perform the runtime checks before entering the loop to ensure it's safe
+///     to vectorize.
+///  2. In the case of loops with uncountable early exits, we may have to do
+///     extra work when exiting the loop early, such as calculating the final
+///     exit values of variables used outside the loop.
+static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
+                                        VectorizationFactor &VF, Loop *L,
+                                        PredicatedScalarEvolution &PSE,
+                                        VPCostContext &CostCtx, VPlan &Plan,
+                                        ScalarEpilogueLowering SEL,
+                                        std::optional<unsigned> VScale) {
+  InstructionCost TotalCost = Checks.getCost();
+  if (!TotalCost.isValid())
     return false;
 
+  // Add on the cost of any work required in the vector early exit block, if
+  // one exists.
+  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
+
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
-    if (CheckCost > VectorizeMemoryCheckThreshold) {
+    // TODO: Should we rename VectorizeMemoryCheckThreshold?
+    if (TotalCost > VectorizeMemoryCheckThreshold) {
       LLVM_DEBUG(
           dbgs()
           << "LV: Interleaving only is not profitable due to runtime checks\n");
@@ -10209,7 +10246,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   // The total cost of the vector loop is
   //    RtC + VecC * (TC / VF) + EpiC
   // where
-  //  * RtC is the cost of the generated runtime checks
+  //  * RtC is the cost of the generated runtime checks plus the cost of
+  //    performing any additional work in the vector.early.exit block for loops
+  //    with uncountable early exits.
   //  * VecC is the cost of a single vector iteration.
   //  * TC is the actual trip count of the loop
   //  * VF is the vectorization factor
@@ -10227,7 +10266,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   // the computations are performed on doubles, not integers and the result
   // is rounded up, hence we get an upper estimate of the TC.
   unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
-  uint64_t RtC = *CheckCost.getValue();
+  uint64_t RtC = *TotalCost.getValue();
   uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
   uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
 
@@ -10555,8 +10594,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // iteration count is low. However, setting the epilogue policy to
         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
         // with runtime checks. It's more effective to let
-        // `areRuntimeChecksProfitable` determine if vectorization is beneficial
-        // for the loop.
+        // `isOutsideLoopWorkProfitable` determine if vectorization is
+        // beneficial for the loop.
         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
       } else {
@@ -10654,9 +10693,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
+                          CM, CM.CostKind);
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
-                                    CM.getVScaleForTuning())) {
+        !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
+                                     LVP.getPlanFor(VF.Width), SEL,
+                                     CM.getVScaleForTuning())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a8b304271f0da..8da7ea3a8ca70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -742,6 +742,18 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     return Ctx.TTI.getArithmeticReductionCost(
         Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
   }
+  case VPInstruction::ExtractFirstActive: {
+    // Calculate the cost of determining the lane index.
+    auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
+    IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
+                                  Type::getInt64Ty(Ctx.LLVMCtx),
+                                  {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
+    InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Cost + Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                             Ctx.CostKind);
+  }
   default:
     // TODO: Compute cost other VPInstructions once the legacy cost model has
     // been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
new file mode 100644
index 0000000000000..55c6c43b6306a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; REQUIRES: asserts
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -disable-output \
+; RUN:   -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @init_mem(ptr, i64);
+
+define i64 @same_exit_block_pre_inc_use1_sve() #1 {
+; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
+; CHECK: LV: Selecting VF: vscale x 16
+; CHECK: Calculating cost of work in exit block vector.early.exit
+; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %index2.next = add i64 %index2, 2
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  %val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
+  %retval = add i64 %val1, %val2
+  ret i64 %retval
+}
+
+define i64 @same_exit_block_pre_inc_use1_nosve() {
+; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
+; CHECK: LV: Selecting VF: 16
+; CHECK: Calculating cost of work in exit block vector.early.exit
+; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
+; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
+; CHECK-NEXT: LV: Too many memory checks needed.
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %index2.next = add i64 %index2, 2
+  %exitcond = icmp ne i64 %index.next, 67
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+  %val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
+  %retval = add i64 %val1, %val2
+  ret i64 %retval
+}
+
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 5f926db1131f6..ab4738bf2901b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -274,6 +274,7 @@ define i64 @loop_contains_safe_div() #1 {
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP12]])
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()