From 22d3e3b84d3efeef7e5fcc8e648883115312c9db Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 11 Nov 2025 14:43:18 -0800 Subject: [PATCH] [MergeICmp][profcheck] Propagate profile info --- llvm/include/llvm/IR/ProfDataUtils.h | 3 ++ llvm/lib/IR/ProfDataUtils.cpp | 2 +- llvm/lib/Transforms/Scalar/MergeICmps.cpp | 44 +++++++++++++++++-- .../MergeICmps/X86/alias-merge-blocks.ll | 37 +++++++++++----- .../MergeICmps/X86/entry-block-shuffled.ll | 27 +++++++++--- 5 files changed, 90 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index f1c2f38c74afd..c8cfccbe61e90 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -149,6 +149,9 @@ LLVM_ABI bool extractProfTotalWeight(const Instruction &I, LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef Weights, bool IsExpected, bool ElideAllZero = false); +/// Push the weights right to fit in uint32_t. +LLVM_ABI SmallVector fitWeights(ArrayRef Weights); + /// Variant of `setBranchWeights` where the `Weights` will be fit first to /// uint32_t by shifting right. LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef Weights, diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index 94dbe1f3988b8..d1ada00fb1f64 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -86,7 +86,7 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData, } /// Push the weights right to fit in uint32_t. 
-static SmallVector fitWeights(ArrayRef Weights) { +SmallVector llvm::fitWeights(ArrayRef Weights) { SmallVector Ret; Ret.reserve(Weights.size()); uint64_t Max = *llvm::max_element(Weights); diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index f273e9d90e71e..d4358c1a6a599 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -50,8 +50,9 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" @@ -66,6 +67,9 @@ using namespace llvm; #define DEBUG_TYPE "mergeicmps" +namespace llvm { +extern cl::opt ProfcheckDisableMetadataFixes; +} // namespace llvm namespace { // A BCE atom "Binary Compare Expression Atom" represents an integer load @@ -607,6 +611,37 @@ class MergedBlockName { }; } // namespace +/// Determine the branch weights for the conditional branch that results from +/// merging \p Comparisons. +static std::optional> +computeMergedBranchWeights(ArrayRef Comparisons) { + assert(!Comparisons.empty()); + if (ProfcheckDisableMetadataFixes) + return std::nullopt; + if (Comparisons.size() == 1) { + SmallVector Weights; + if (!extractBranchWeights(*Comparisons[0].BB->getTerminator(), Weights)) + return std::nullopt; + return Weights; + } + // The probability to go to the phi block is the disjunction of the + // probabilities to go to the phi block from the individual Comparisons. We'll + // swap the weights because `getDisjunctionWeights` computes the disjunction + // for the "true" branch, then swap back. + SmallVector Weights{0, 1}; + // At this point, Weights encodes "0-probability" for the "true" side. 
+ for (const auto &C : Comparisons) { + SmallVector W; + if (!extractBranchWeights(*C.BB->getTerminator(), W)) + return std::nullopt; + + std::swap(W[0], W[1]); + Weights = getDisjunctionWeights(Weights, W); + } + std::swap(Weights[0], Weights[1]); + return fitWeights(Weights); +} + // Merges the given contiguous comparison blocks into one memcmp block. static BasicBlock *mergeComparisons(ArrayRef Comparisons, BasicBlock *const InsertBefore, @@ -640,7 +675,7 @@ static BasicBlock *mergeComparisons(ArrayRef Comparisons, // If there is one block that requires splitting, we do it now, i.e. // just before we know we will collapse the chain. The instructions // can be executed before any of the instructions in the chain. - const auto ToSplit = llvm::find_if( + const auto *ToSplit = llvm::find_if( Comparisons, [](const BCECmpBlock &B) { return B.RequireSplit; }); if (ToSplit != Comparisons.end()) { LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n"); @@ -655,6 +690,7 @@ static BasicBlock *mergeComparisons(ArrayRef Comparisons, LhsLoad->replaceUsesOfWith(LhsLoad->getOperand(0), Lhs); RhsLoad->replaceUsesOfWith(RhsLoad->getOperand(0), Rhs); // There are no blocks to merge, just do the comparison. + // If we condition on this IsEqual, we already have its probabilities. IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad); } else { const unsigned TotalSizeBits = std::accumulate( @@ -684,7 +720,9 @@ static BasicBlock *mergeComparisons(ArrayRef Comparisons, DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}}); } else { // Continue to next block if equal, exit to phi else. 
- Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB); + auto *BI = Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB); + if (auto BranchWeights = computeMergedBranchWeights(Comparisons)) + setBranchWeights(*BI, BranchWeights.value(), /*IsExpected=*/false); Phi.addIncoming(ConstantInt::getFalse(Context), BB); DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock}, {DominatorTree::Insert, BB, PhiBB}}); diff --git a/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll b/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll index f4bc96db86146..f55dbe678582f 100644 --- a/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll +++ b/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s --check-prefix=X86 %S = type { i32, i32, i32, i32, i32} @@ -15,7 +15,7 @@ define zeroext i1 @opeq1( ; X86-NEXT: ret i1 [[TMP2]] ; ptr nocapture readonly dereferenceable(16) %a, - ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync { + ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !2 { entry: %ptr = alloca i32 @@ -24,7 +24,7 @@ entry: ; Does other work, has no interference, merge block store i32 42, ptr %ptr %cmp.i = icmp eq i32 %0, %1 - br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !3 land.rhs.i: %second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1 @@ -32,7 +32,7 @@ land.rhs.i: %second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1 %3 = load i32, ptr %second2.i, align 4 %cmp2.i = icmp eq i32 %2, %3 - br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit + br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit, !prof !4 land.rhs.i.2: 
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2 @@ -40,7 +40,7 @@ land.rhs.i.2: %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2 %5 = load i32, ptr %third2.i, align 4 %cmp3.i = icmp eq i32 %4, %5 - br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit + br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit, !prof !5 land.rhs.i.3: %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3 @@ -55,15 +55,15 @@ opeq1.exit: ret i1 %8 } -define zeroext i1 @part_sequent_eq_with_metadata() { +define zeroext i1 @part_sequent_eq_with_metadata() !prof !2 { ; X86-LABEL: @part_sequent_eq_with_metadata( ; X86-NEXT: bb01: ; X86-NEXT: [[A:%.*]] = alloca [[S:%.*]], align 8 ; X86-NEXT: [[B:%.*]] = alloca [[S]], align 8 -; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG0:![0-9]+]], !noundef !1 -; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG0]], !noundef !1 +; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG1:![0-9]+]], !noundef [[META2:![0-9]+]] +; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG1]], !noundef [[META2]] ; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]] +; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]] ; X86: "bb1+bb2+bb3": ; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 2 ; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 2 @@ -80,7 +80,7 @@ bb0: %value0 = load i32, ptr %a, align 4, !range !0, !noundef !1 %value1 = load i32, ptr %b, align 4, !range !0, !noundef !1 %cmp.i = icmp eq i32 %value0, %value1 - br i1 %cmp.i, label %bb1, label %exit + br i1 %cmp.i, label %bb1, label %exit, !prof !3 bb1: %second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2 @@ -88,7 +88,7 @@ bb1: %second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2 %value3 = load i32, ptr %second2.i, align 4 %cmp2.i 
= icmp eq i32 %value2, %value3 - br i1 %cmp2.i, label %bb2, label %exit + br i1 %cmp2.i, label %bb2, label %exit, !prof !4 bb2: %third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3 @@ -96,7 +96,7 @@ bb2: %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 3 %value5 = load i32, ptr %third2.i, align 4 %cmp3.i = icmp eq i32 %value4, %value5 - br i1 %cmp3.i, label %bb3, label %exit + br i1 %cmp3.i, label %bb3, label %exit, !prof !5 bb3: %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 4 @@ -113,3 +113,16 @@ exit: !0 = !{i32 0, i32 2} !1 = !{} +!2 = !{!"function_entry_count", i32 100} +!3 = !{!"branch_weights", i32 2, i32 3} +!4 = !{!"branch_weights", i32 5, i32 7} +!5 = !{!"branch_weights", i32 11, i32 13} +;. +; X86: attributes #[[ATTR0:[0-9]+]] = { nofree nosync } +; X86: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) } +;. +; X86: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} +; X86: [[RNG1]] = !{i32 0, i32 2} +; X86: [[META2]] = !{} +; X86: [[PROF3]] = !{!"branch_weights", i32 2, i32 3} +;. 
diff --git a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll index bc6beefb2caee..855b58992b255 100644 --- a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll +++ b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes=mergeicmps -verify-dom-info -mtriple=x86_64-unknown-unknown -S | FileCheck %s %S = type { i32, i32, i32, i32 } @@ -15,11 +15,11 @@ define zeroext i1 @opeq1( ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]] +; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: "land.rhs.i+land.rhs.i.2": ; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 8) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]], !prof [[PROF2:![0-9]+]] ; CHECK: land.rhs.i.31: ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 3 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3 @@ -32,20 +32,20 @@ define zeroext i1 @opeq1( ; CHECK-NEXT: ret i1 [[TMP11]] ; ptr nocapture readonly dereferenceable(16) %a, - ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync { + ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !0 { entry: %first.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3 %0 = load i32, 
ptr %first.i, align 4 %first1.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2 %1 = load i32, ptr %first1.i, align 4 %cmp.i = icmp eq i32 %0, %1 - br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !1 land.rhs.i: %2 = load i32, ptr %a, align 4 %3 = load i32, ptr %b, align 4 %cmp3.i = icmp eq i32 %2, %3 - br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit + br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit, !prof !2 land.rhs.i.2: %third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1 @@ -53,7 +53,7 @@ land.rhs.i.2: %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1 %5 = load i32, ptr %third2.i, align 4 %cmp4.i = icmp eq i32 %4, %5 - br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit + br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit, !prof !3 land.rhs.i.3: %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3 @@ -67,3 +67,16 @@ opeq1.exit: %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i], [ false, %land.rhs.i.2 ], [ %cmp5.i, %land.rhs.i.3 ] ret i1 %8 } + +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) } +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 55, i32 233} +;.