Skip to content

Conversation

@mtrofin
Copy link
Member

@mtrofin mtrofin commented Nov 11, 2025

Propagate branch weights in mergeComparisons​ : the probability of reaching the common "exit" BB (bb_phi​ in the description in processPhi​)doesn't change, and is a disjunction over the probabilities of doing that from the blocks performing comparisons which are now being merged

Issue #147390

Copy link
Member Author

mtrofin commented Nov 11, 2025

@mtrofin mtrofin marked this pull request as ready for review November 11, 2025 22:44
@llvmbot
Copy link
Member

llvmbot commented Nov 11, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-ir

Author: Mircea Trofin (mtrofin)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/167594.diff

5 Files Affected:

  • (modified) llvm/include/llvm/IR/ProfDataUtils.h (+3)
  • (modified) llvm/lib/IR/ProfDataUtils.cpp (+1-1)
  • (modified) llvm/lib/Transforms/Scalar/MergeICmps.cpp (+26-2)
  • (modified) llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll (+25-12)
  • (modified) llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll (+21-7)
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index fade7a2dbac2b..50c16a5791e70 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -148,6 +148,9 @@ LLVM_ABI bool extractProfTotalWeight(const Instruction &I,
 LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
                                bool IsExpected, bool ElideAllZero = false);
 
+/// Push the weights right to fit in uint32_t.
+LLVM_ABI SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights);
+
 /// Variant of `setBranchWeights` where the `Weights` will be fit first to
 /// uint32_t by shifting right.
 LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef<uint64_t> Weights,
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index 94dbe1f3988b8..d1ada00fb1f64 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -86,7 +86,7 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
 }
 
 /// Push the weights right to fit in uint32_t.
-static SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights) {
+SmallVector<uint32_t> llvm::fitWeights(ArrayRef<uint64_t> Weights) {
   SmallVector<uint32_t> Ret;
   Ret.reserve(Weights.size());
   uint64_t Max = *llvm::max_element(Weights);
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index f273e9d90e71e..083aeb188620f 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -50,8 +50,9 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Scalar.h"
@@ -66,6 +67,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "mergeicmps"
 
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+} // namespace llvm
 namespace {
 
 // A BCE atom "Binary Compare Expression Atom" represents an integer load
@@ -647,6 +651,8 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
     ToSplit->split(BB, AA);
   }
 
+  SmallVector<uint32_t, 2> BranchWeights;
+  bool HasBranchWeights = false;
   if (Comparisons.size() == 1) {
     LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
     // Use clone to keep the metadata
@@ -656,6 +662,8 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
     RhsLoad->replaceUsesOfWith(RhsLoad->getOperand(0), Rhs);
     // There are no blocks to merge, just do the comparison.
     IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
+    HasBranchWeights =
+        extractBranchWeights(*FirstCmp.BB->getTerminator(), BranchWeights);
   } else {
     const unsigned TotalSizeBits = std::accumulate(
         Comparisons.begin(), Comparisons.end(), 0u,
@@ -673,6 +681,20 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
         Builder, DL, &TLI);
     IsEqual = Builder.CreateICmpEQ(
         MemCmpCall, ConstantInt::get(Builder.getIntNTy(IntBits), 0));
+    SmallVector<uint64_t, 2> Weights(2);
+    Weights[1] = 1;
+    for (const auto &C : Comparisons) {
+      SmallVector<uint32_t, 2> W;
+      if (!extractBranchWeights(*C.BB->getTerminator(), W)) {
+        HasBranchWeights = false;
+        break;
+      }
+      std::swap(W[0], W[1]);
+      Weights = getDisjunctionWeights(Weights, W);
+    }
+    std::swap(Weights[0], Weights[1]);
+    BranchWeights = fitWeights(Weights);
+    HasBranchWeights = true;
   }
 
   BasicBlock *const PhiBB = Phi.getParent();
@@ -684,7 +706,9 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
     DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
   } else {
     // Continue to next block if equal, exit to phi else.
-    Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
+    auto *BI = Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
+    if (!ProfcheckDisableMetadataFixes && HasBranchWeights)
+      setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
     Phi.addIncoming(ConstantInt::getFalse(Context), BB);
     DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
                       {DominatorTree::Insert, BB, PhiBB}});
diff --git a/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll b/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
index f4bc96db86146..f55dbe678582f 100644
--- a/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s --check-prefix=X86
 
 %S = type { i32, i32, i32, i32, i32}
@@ -15,7 +15,7 @@ define zeroext i1 @opeq1(
 ; X86-NEXT:    ret i1 [[TMP2]]
 ;
   ptr nocapture readonly dereferenceable(16) %a,
-  ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
+  ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !2 {
 
 entry:
   %ptr = alloca i32
@@ -24,7 +24,7 @@ entry:
   ; Does other work, has no interference, merge block
   store i32 42, ptr %ptr
   %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !3
 
 land.rhs.i:
   %second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
@@ -32,7 +32,7 @@ land.rhs.i:
   %second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
   %3 = load i32, ptr %second2.i, align 4
   %cmp2.i = icmp eq i32 %2, %3
-  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit, !prof !4
 
 land.rhs.i.2:
   %third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
@@ -40,7 +40,7 @@ land.rhs.i.2:
   %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
   %5 = load i32, ptr %third2.i, align 4
   %cmp3.i = icmp eq i32 %4, %5
-  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit, !prof !5
 
 land.rhs.i.3:
   %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
@@ -55,15 +55,15 @@ opeq1.exit:
   ret i1 %8
 }
 
-define zeroext i1 @part_sequent_eq_with_metadata() {
+define zeroext i1 @part_sequent_eq_with_metadata() !prof !2 {
 ; X86-LABEL: @part_sequent_eq_with_metadata(
 ; X86-NEXT:  bb01:
 ; X86-NEXT:    [[A:%.*]] = alloca [[S:%.*]], align 8
 ; X86-NEXT:    [[B:%.*]] = alloca [[S]], align 8
-; X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG0:![0-9]+]], !noundef !1
-; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG0]], !noundef !1
+; X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG1:![0-9]+]], !noundef [[META2:![0-9]+]]
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG1]], !noundef [[META2]]
 ; X86-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-; X86-NEXT:    br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]]
+; X86-NEXT:    br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]]
 ; X86:       "bb1+bb2+bb3":
 ; X86-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 2
 ; X86-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 2
@@ -80,7 +80,7 @@ bb0:
   %value0 = load i32, ptr %a, align 4, !range !0, !noundef !1
   %value1 = load i32, ptr %b, align 4, !range !0, !noundef !1
   %cmp.i = icmp eq i32 %value0, %value1
-  br i1 %cmp.i, label %bb1, label %exit
+  br i1 %cmp.i, label %bb1, label %exit, !prof !3
 
 bb1:
   %second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
@@ -88,7 +88,7 @@ bb1:
   %second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
   %value3 = load i32, ptr %second2.i, align 4
   %cmp2.i = icmp eq i32 %value2, %value3
-  br i1 %cmp2.i, label %bb2, label %exit
+  br i1 %cmp2.i, label %bb2, label %exit, !prof !4
 
 bb2:
   %third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
@@ -96,7 +96,7 @@ bb2:
   %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 3
   %value5 = load i32, ptr %third2.i, align 4
   %cmp3.i = icmp eq i32 %value4, %value5
-  br i1 %cmp3.i, label %bb3, label %exit
+  br i1 %cmp3.i, label %bb3, label %exit, !prof !5
 
 bb3:
   %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 4
@@ -113,3 +113,16 @@ exit:
 
 !0 = !{i32 0, i32 2}
 !1 = !{}
+!2 = !{!"function_entry_count", i32 100}
+!3 = !{!"branch_weights", i32 2, i32 3}
+!4 = !{!"branch_weights", i32 5, i32 7}
+!5 = !{!"branch_weights", i32 11, i32 13}
+;.
+; X86: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
+; X86: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
+;.
+; X86: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100}
+; X86: [[RNG1]] = !{i32 0, i32 2}
+; X86: [[META2]] = !{}
+; X86: [[PROF3]] = !{!"branch_weights", i32 2, i32 3}
+;.
diff --git a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll
index bc6beefb2caee..720cbc459a284 100644
--- a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt < %s -passes=mergeicmps -verify-dom-info -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 %S = type { i32, i32, i32, i32 }
@@ -15,11 +15,11 @@ define zeroext i1 @opeq1(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       "land.rhs.i+land.rhs.i.2":
 ; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 8)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[MEMCMP]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]], !prof [[PROF2:![0-9]+]]
 ; CHECK:       land.rhs.i.31:
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3
@@ -32,20 +32,20 @@ define zeroext i1 @opeq1(
 ; CHECK-NEXT:    ret i1 [[TMP11]]
 ;
   ptr nocapture readonly dereferenceable(16) %a,
-  ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
+  ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !0 {
 entry:
   %first.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
   %0 = load i32, ptr %first.i, align 4
   %first1.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
   %1 = load i32, ptr %first1.i, align 4
   %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !1
 
 land.rhs.i:
   %2 = load i32, ptr %a, align 4
   %3 = load i32, ptr %b, align 4
   %cmp3.i = icmp eq i32 %2, %3
-  br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit
+  br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit, !prof !2
 
 land.rhs.i.2:
   %third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
@@ -53,7 +53,7 @@ land.rhs.i.2:
   %third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
   %5 = load i32, ptr %third2.i, align 4
   %cmp4.i = icmp eq i32 %4, %5
-  br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit
+  br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit, !prof !3
 
 land.rhs.i.3:
   %fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
@@ -67,3 +67,17 @@ opeq1.exit:
   %8 = phi i1 [ false, %entry ], [ false,  %land.rhs.i], [ false, %land.rhs.i.2 ], [ %cmp5.i, %land.rhs.i.3 ]
   ret i1 %8
 }
+
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 2, i32 3}
+!2 = !{!"branch_weights", i32 5, i32 7}
+!3 = !{!"branch_weights", i32 11, i32 13}
+!4 = !{!"branch_weights", i32 17, i32 19}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 55, i32 233}
+;.

@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch from 3affc72 to 2026110 Compare November 11, 2025 22:48
@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch from 2026110 to e5f8a76 Compare November 12, 2025 01:02
@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_nfc_generalize_the_arithmetic_type_for_getdisjunctionweights_ branch from 49b9823 to 967055b Compare November 12, 2025 01:02
Base automatically changed from users/mtrofin/11-11-_nfc_generalize_the_arithmetic_type_for_getdisjunctionweights_ to main November 12, 2025 01:45
@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch 4 times, most recently from c1e4550 to 7e5054e Compare November 13, 2025 22:33
; X86-NEXT: [[B:%.*]] = alloca [[S]], align 8
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG0:![0-9]+]], !noundef !1
; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG0]], !noundef !1
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG1:![0-9]+]], !noundef [[META2:![0-9]+]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the rename RNG0 -> RNG1 ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UTC did that. Probably because we now pass --check-globals and some metadata is now apparent to UTC, and occurs earlier.

!1 = !{!"branch_weights", i32 2, i32 3}
!2 = !{!"branch_weights", i32 5, i32 7}
!3 = !{!"branch_weights", i32 11, i32 13}
!4 = !{!"branch_weights", i32 17, i32 19}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unused ?

@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch from 7e5054e to 1f12bc6 Compare November 14, 2025 15:42
@mtrofin mtrofin force-pushed the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch from 1f12bc6 to 22d3e3b Compare November 14, 2025 15:55
Copy link
Member Author

mtrofin commented Nov 14, 2025

Merge activity

  • Nov 14, 5:28 PM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Nov 14, 5:29 PM UTC: @mtrofin merged this pull request with Graphite.

@mtrofin mtrofin merged commit 9ac84a6 into main Nov 14, 2025
10 checks passed
@mtrofin mtrofin deleted the users/mtrofin/11-11-_mergeicmp_profcheck_propagate_profile_info branch November 14, 2025 17:29
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants