diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 576a7e8d43e8d1..3b15f6379211a0 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -262,10 +262,22 @@ TransformationMode hasLICMVersioningTransformation(Loop *L);
 void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
                              unsigned V = 0);
 
-/// Get a loop's estimated trip count based on branch weight metadata.
+/// Returns a loop's estimated trip count based on branch weight metadata.
+/// In addition, if \p EstimatedLoopInvocationWeight is not null, it is
+/// initialized with the weight of the loop's latch leading to the exit.
 /// Returns 0 when the count is estimated to be 0, or None when a meaningful
 /// estimate can not be made.
-Optional<unsigned> getLoopEstimatedTripCount(Loop *L);
+Optional<unsigned>
+getLoopEstimatedTripCount(Loop *L,
+                          unsigned *EstimatedLoopInvocationWeight = nullptr);
+
+/// Set a loop's branch weight metadata to reflect that the loop has \p
+/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits
+/// through the latch. Returns true if the metadata was successfully updated,
+/// false otherwise. Note that the loop must have a latch block that controls
+/// the loop exit in order to succeed.
+bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+                               unsigned EstimatedLoopInvocationWeight);
 
 /// Check inner loop (L) backedge count is known to be invariant on all
 /// iterations of its outer loop. If the loop has no parent, this is trivially
@@ -370,6 +382,23 @@ int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
                           DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
                           SmallVector<WeakTrackingVH, 16> &DeadInsts);
 
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on the weights
+/// for \p OrigLoop, distributing the original trip count (TC) between them:
+/// \p UnrolledLoop receives weights that reflect TC/UF iterations, and
+/// \p RemainderLoop receives weights that reflect the remaining TC%UF
+/// iterations.
+///
+/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p
+/// RemainderLoop, in which case the weights for \p OrigLoop are updated
+/// accordingly. Note also that behavior is undefined if \p UnrolledLoop and
+/// \p RemainderLoop are equal. \p UF must be greater than zero.
+/// If \p OrigLoop has no associated profile info, nothing happens.
+///
+/// This utility may be useful for optimizations such as unrolling and
+/// vectorization, for which this is a typical transformation.
+void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                  Loop *RemainderLoop, uint64_t UF);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
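Below is a minimal sketch of how a transform might use the new getter/setter pair, assuming profile metadata is present on the latch. The helper `rescaleTripCountEstimate` and its surroundings are hypothetical and not part of this patch; only `getLoopEstimatedTripCount` and `setLoopEstimatedTripCount` come from the declarations above.

```cpp
#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Hypothetical helper: if the profile says the loop runs for TC iterations,
// record an estimate of TC / Factor while preserving the invocation weight.
static void rescaleTripCountEstimate(Loop *L, unsigned Factor) {
  if (Factor == 0)
    return;
  unsigned InvocationWeight = 0;
  Optional<unsigned> TC = getLoopEstimatedTripCount(L, &InvocationWeight);
  if (!TC)
    return; // No usable profile; leave the latch metadata untouched.
  // setLoopEstimatedTripCount returns false if the loop shape is unsupported
  // (e.g. no exiting latch).
  setLoopEstimatedTripCount(L, *TC / Factor, InvocationWeight);
}
```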
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c9de4340cb2984..88b0f8eff27b84 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -690,17 +691,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
   }
 }
 
-Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
-  // Support loops with an exiting latch and other existing exists only
-  // deoptimize.
-
-  // Get the branch weights for the loop's backedge.
+/// Checks if \p L has a single exit through the latch block, except possibly
+/// for "deoptimizing" exits. Returns the branch instruction terminating the
+/// loop latch if the above check is successful, nullptr otherwise.
+static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
   BasicBlock *Latch = L->getLoopLatch();
   if (!Latch)
-    return None;
+    return nullptr;
+
   BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
   if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
-    return None;
+    return nullptr;
 
   assert((LatchBR->getSuccessor(0) == L->getHeader() ||
           LatchBR->getSuccessor(1) == L->getHeader()) &&
@@ -711,21 +712,36 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   if (any_of(ExitBlocks, [](const BasicBlock *EB) {
         return !EB->getTerminatingDeoptimizeCall();
       }))
+    return nullptr;
+
+  return LatchBR;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+                                unsigned *EstimatedLoopInvocationWeight) {
+  // Support loops with an exiting latch and other exits that only
+  // deoptimize.
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
     return None;
 
   // To estimate the number of times the loop body was executed, we want to
   // know the number of times the backedge was taken, vs. the number of times
   // we exited the loop.
   uint64_t BackedgeTakenWeight, LatchExitWeight;
-  if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+  if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
     return None;
 
-  if (LatchBR->getSuccessor(0) != L->getHeader())
+  if (LatchBranch->getSuccessor(0) != L->getHeader())
     std::swap(BackedgeTakenWeight, LatchExitWeight);
 
   if (!LatchExitWeight)
     return None;
 
+  if (EstimatedLoopInvocationWeight)
+    *EstimatedLoopInvocationWeight = LatchExitWeight;
+
   // Estimated backedge taken count is a ratio of the backedge taken weight by
   // the weight of the edge exiting the loop, rounded to nearest.
   uint64_t BackedgeTakenCount =
@@ -734,6 +750,37 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   return BackedgeTakenCount + 1;
 }
 
+bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+                                     unsigned EstimatedLoopInvocationWeight) {
+  // Support loops with an exiting latch and other exits that only
+  // deoptimize.
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return false;
+
+  // Calculate taken and exit weights.
+  unsigned LatchExitWeight = 0;
+  unsigned BackedgeTakenWeight = 0;
+
+  if (EstimatedTripCount > 0) {
+    LatchExitWeight = EstimatedLoopInvocationWeight;
+    BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+  }
+
+  // Swap the weights if the backedge is taken when the condition is "false".
+  if (LatchBranch->getSuccessor(0) != L->getHeader())
+    std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+  MDBuilder MDB(LatchBranch->getContext());
+
+  // Set/update profile metadata.
+  LatchBranch->setMetadata(
+      LLVMContext::MD_prof,
+      MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+
+  return true;
+}
+
 bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
                                               ScalarEvolution &SE) {
   Loop *OuterL = InnerLoop->getParentLoop();
@@ -1351,3 +1398,29 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI,
   Rewriter.clearInsertPoint();
   return NumReplaced;
 }
+
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop.
+void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                        Loop *RemainderLoop, uint64_t UF) {
+  assert(UF > 0 && "Zero unrolled factor is not supported");
+  assert(UnrolledLoop != RemainderLoop &&
+         "Unrolled and Remainder loops are expected to be distinct");
+
+  // Get number of iterations in the original scalar loop.
+  unsigned OrigLoopInvocationWeight = 0;
+  Optional<unsigned> OrigAverageTripCount =
+      getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
+  if (!OrigAverageTripCount)
+    return;
+
+  // Calculate number of iterations in unrolled loop.
+  unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
+  // Calculate number of iterations for remainder loop.
+  unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
+
+  setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
+                            OrigLoopInvocationWeight);
+  setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
+                            OrigLoopInvocationWeight);
+}
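As a sanity check on the weight arithmetic above, here is a standalone sketch (plain C++17, independent of LLVM, not part of the patch) of both directions: the estimated backedge-taken count is the backedge weight divided by the latch-exit weight, rounded to nearest, and the trip count adds one; setting an estimate inverts that. The numbers mirror the new test below, where latch weights {exit 1, backedge 1023} give an estimated trip count of 1024, and recording trip counts of 1024/4 and 1024%4 with invocation weight 1 yields backedge/exit weights {255, 1} and {0, 0}.

```cpp
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

// Mirrors getLoopEstimatedTripCount: divide-to-nearest plus one.
static std::optional<uint64_t> estimateTripCount(uint64_t BackedgeWeight,
                                                 uint64_t ExitWeight) {
  if (ExitWeight == 0)
    return std::nullopt;
  return (BackedgeWeight + ExitWeight / 2) / ExitWeight + 1;
}

// Mirrors setLoopEstimatedTripCount: trip count N with invocation weight W
// becomes {backedge = (N - 1) * W, exit = W}; N == 0 clears both weights.
static std::pair<uint64_t, uint64_t> weightsFor(uint64_t TripCount, uint64_t W) {
  if (TripCount == 0)
    return {0, 0};
  return {(TripCount - 1) * W, W};
}

int main() {
  assert(estimateTripCount(1023, 1) == 1024); // matches !{..., i32 1, i32 1023}
  assert(weightsFor(1024 / 4, 1) == std::make_pair(uint64_t(255), uint64_t(1)));
  assert(weightsFor(1024 % 4, 1) == std::make_pair(uint64_t(0), uint64_t(0)));
  return 0;
}
```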
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af42e00f0b74b8..b1650713d5462d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3483,6 +3483,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
 
   // Remove redundant induction instructions.
   cse(LoopVectorBody);
+
+  // Set/update profile weights for the vector and remainder loops as the
+  // original loop iterations are now distributed among them. Note that the
+  // original loop, represented by LoopScalarBody, becomes the remainder loop
+  // after vectorization.
+  //
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // get a slightly less precise result, but that is OK since the profile is
+  // not inherently precise anyway. A possible bypass of the vector code due
+  // to legality checks is also ignored; all the weight is optimistically
+  // assigned to the vector loop.
+  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
+                               LI->getLoopFor(LoopVectorBody),
+                               LI->getLoopFor(LoopScalarBody), VF * UF);
 }
 
 void InnerLoopVectorizer::fixCrossIterationPHIs() {
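For reference, the expected weights in the first function of the new test below follow directly from this call. The original latch profile is !{!"branch_weights", i32 1, i32 1023}, i.e. an estimated trip count of 1024 with invocation weight 1. With -force-vector-width=4 and -force-vector-interleave=1 (VF * UF = 4), the vector loop gets 1024 / 4 = 256 iterations, hence weights {1, 255}, and the scalar remainder gets 1024 % 4 = 0 iterations, hence {0, 0}. The CHECK-MASKED run uses -force-vector-interleave=4, so VF * UF = 16 and the vector loop gets 1024 / 16 = 64 iterations, hence {1, 63}.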
diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
new file mode 100644
index 00000000000000..50b64d86c2303d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
+; RUN: opt -passes="print,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s | FileCheck %s -check-prefix=CHECK-MASKED
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = dso_local global [1024 x i32] zeroinitializer, align 16
+@b = dso_local global [1024 x i32] zeroinitializer, align 16
+
+; Check correctness of profile info for vectorization without epilog.
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foov() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foov(
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
+}
+
+; Check correctness of profile info for vectorization with epilog.
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foo2v(
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1027
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
+}
+
+attributes #0 = { "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!"branch_weights", i32 1, i32 1023}
+!7 = !{!"branch_weights", i32 1, i32 1026}
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
index 56f8b3e83c7dc8..e19f4aa85c02c4 100644
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -61,8 +61,10 @@ define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
 ; but has a high trip count per invocation. Vectorize it.
 
 ; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK: vector.body:
-
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
 entry:
   br i1 %cond, label %for.preheader, label %for.end, !prof !2
 
@@ -205,6 +207,15 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
+; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
+; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
+; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1
+; for vectorized and remainder loops, respectively, therefore their
+; estimatedBackedgeTakenCounts are 249 and 0, and so the weights recorded with
+; loop invocation weights of 10 are the above {10, 2490} and {10, 0}.
+
 !0 = !{!"function_entry_count", i64 100}
 !1 = !{!"branch_weights", i32 100, i32 0}
 !2 = !{!"branch_weights", i32 10, i32 90}
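As a final consistency check using the numbers in the comment above: reading the recorded weights back through getLoopEstimatedTripCount recovers the distributed estimates. For the vectorized loop, {10, 2490} gives round(2490 / 10) + 1 = 250, and for the remainder loop, {10, 0} gives round(0 / 10) + 1 = 1, matching the 1,001 / 4 = 250 and 1,001 % 4 = 1 split described above.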