-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[profcheck] Add heuristical profile metadata for lowering table-based cttz. #161898
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[profcheck] Add heuristical profile metadata for lowering table-based cttz. #161898
Conversation
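@llvm/pr-subscribers-llvm-transforms Author: Jin Huang (jinhuang1102) Changes: When lowering a `table-based cttz` calculation to the `llvm.cttz` intrinsic, `AggressiveInstCombine` was not attaching profile metadata to the newly generated `select` instruction. This PR adds heuristic branch weights to the `select`. It uses a strong 100-to-1 probability favoring the `cttz` path over the zero-input case. This allows later passes to optimize code layout and branch prediction. Patch is 20.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161898.diff 5 Files Affected: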
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 805bdb41737c1..64ff8a2c2b1df 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -28,8 +28,10 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -599,6 +601,11 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
+ if (Instruction *SelectI = dyn_cast<Instruction>(Select))
+ SelectI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(SelectI->getContext()).createBranchWeights(1, 100));
+
// NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
// it should be handled as: `cttz(x) & (typeSize - 1)`.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index bb3001e909b8c..7e648d6ee6bb5 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -91,12 +91,13 @@
@ctz7.table = internal unnamed_addr constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @ctz1(i32 %x) {
+define i32 @ctz1(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -113,12 +114,13 @@ entry:
ret i32 %conv
}
-define i32 @ctz1_nusw(i32 %x) {
+define i32 @ctz1_nusw(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1_nusw(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -137,8 +139,9 @@ entry:
@ctz2.table = internal unnamed_addr constant [64 x i16] [i16 32, i16 0, i16 1, i16 12, i16 2, i16 6, i16 0, i16 13, i16 3, i16 0, i16 7, i16 0, i16 0, i16 0, i16 0, i16 14, i16 10, i16 4, i16 0, i16 0, i16 8, i16 0, i16 0, i16 25, i16 0, i16 0, i16 0, i16 0, i16 0, i16 21, i16 27, i16 15, i16 31, i16 11, i16 5, i16 0, i16 0, i16 0, i16 0, i16 0, i16 9, i16 0, i16 0, i16 24, i16 0, i16 0, i16 20, i16 26, i16 30, i16 0, i16 0, i16 0, i16 0, i16 23, i16 0, i16 19, i16 29, i16 0, i16 22, i16 18, i16 28, i16 17, i16 16, i16 0], align 2
-define i32 @ctz2(i32 %x) {
+define i32 @ctz2(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz2(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i16
@@ -159,11 +162,12 @@ entry:
@ctz3.table = internal unnamed_addr constant [32 x i32] [i32 0, i32 1, i32 2, i32 24, i32 3, i32 19, i32 6, i32 25, i32 22, i32 4, i32 20, i32 10, i32 16, i32 7, i32 12, i32 26, i32 31, i32 23, i32 18, i32 5, i32 21, i32 9, i32 15, i32 11, i32 30, i32 17, i32 8, i32 14, i32 29, i32 13, i32 28, i32 27], align 4
-define i32 @ctz3(i32 %x) {
+define i32 @ctz3(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz3(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]], !prof [[PROF_2:![0-9]+]]
; CHECK: if.end:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X]], i1 true)
; CHECK-NEXT: br label [[RETURN]]
@@ -190,11 +194,12 @@ return: ; preds = %entry, %if.end
ret i32 %retval.0
}
-define i32 @ctz3_with_i8gep(i32 %x) {
+define i32 @ctz3_with_i8gep(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz3_with_i8gep(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]], !prof [[PROF_2]]
; CHECK: if.end:
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X]], i1 true)
; CHECK-NEXT: br label [[RETURN]]
@@ -225,12 +230,13 @@ return: ; preds = %if.end, %entry
@table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
-define i32 @ctz4(i64 %b) {
+define i32 @ctz4(i64 %b) !prof !0 {
; CHECK-LABEL: @ctz4(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.cttz.i64(i64 [[B:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[B]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; CHECK-NEXT: ret i32 [[TMP3]]
;
@@ -246,12 +252,13 @@ entry:
@ctz5.table = internal unnamed_addr constant [32 x i8] c"\00\01\02\18\03\13\06\19\16\04\14\0A\10\07\0C\1A\1F\17\12\05\15\09\0F\0B\1E\11\08\0E\1D\0D\1C\1B", align 1
-define i32 @ctz5(i32 %x) {
+define i32 @ctz5(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz5(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -270,12 +277,13 @@ entry:
@ctz6.table = constant [64 x i32] [i32 63, i32 0, i32 58, i32 1, i32 59, i32 47, i32 53, i32 2, i32 60, i32 39, i32 48, i32 27, i32 54, i32 33, i32 42, i32 3, i32 61, i32 51, i32 37, i32 40, i32 49, i32 18, i32 28, i32 20, i32 55, i32 30, i32 34, i32 11, i32 43, i32 14, i32 22, i32 4, i32 62, i32 57, i32 46, i32 52, i32 38, i32 26, i32 32, i32 41, i32 50, i32 36, i32 17, i32 19, i32 29, i32 10, i32 13, i32 21, i32 56, i32 45, i32 25, i32 31, i32 35, i32 16, i32 9, i32 12, i32 44, i32 24, i32 15, i32 8, i32 23, i32 7, i32 6, i32 5], align 4
-define i32 @ctz6(i64 %n) {
+define i32 @ctz6(i64 %n) !prof !0 {
; CHECK-LABEL: @ctz6(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.cttz.i64(i64 [[N:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 63, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 63, i64 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; CHECK-NEXT: ret i32 [[TMP3]]
;
@@ -291,12 +299,13 @@ entry:
@ctz8.table = internal unnamed_addr constant [32 x i32] [i32 31, i32 0, i32 1, i32 23, i32 2, i32 18, i32 5, i32 24, i32 21, i32 3, i32 19, i32 9, i32 15, i32 6, i32 11, i32 25, i32 30, i32 22, i32 17, i32 4, i32 20, i32 8, i32 14, i32 10, i32 29, i32 16, i32 7, i32 13, i32 28, i32 12, i32 27, i32 26], align 4
-define i32 @ctz8(i32 %v) {
+define i32 @ctz8(i32 %v) !prof !0 {
; CHECK-LABEL: @ctz8(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[V:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[V]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 31, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 31, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: ret i32 [[TMP2]]
;
entry:
@@ -312,12 +321,13 @@ entry:
;; This has a wrong table size but is otherwise fine.
@ctz9.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @ctz9(i32 %x) {
+define i32 @ctz9(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz9(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -334,12 +344,13 @@ entry:
ret i32 %conv
}
-define i32 @ctz1_with_i8_gep(i32 %x) {
+define i32 @ctz1_with_i8_gep(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1_with_i8_gep(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -357,8 +368,9 @@ entry:
}
; This is the same a ctz2 (i16 table) with an i8 gep making the indices invalid
-define i32 @ctz2_with_i8_gep(i32 %x) {
+define i32 @ctz2_with_i8_gep(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz2_with_i8_gep(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[X:%.*]]
; CHECK-NEXT: [[AND:%.*]] = and i32 [[SUB]], [[X]]
@@ -383,8 +395,9 @@ entry:
}
; This is the same a ctz2_with_i8_gep but with the gep index multiplied by 2.
-define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
+define i32 @ctz2_with_i8_gep_fixed(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32
@@ -404,12 +417,13 @@ define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
; This is a i16 input with the debruijn table stored in a single i128.
@tablei128 = internal unnamed_addr constant i128 16018378897745984667142067713738932480, align 16
-define i32 @cttz_i16_via_i128(i16 noundef %x) {
+define i32 @cttz_i16_via_i128(i16 noundef %x) !prof !0 {
; CHECK-LABEL: @cttz_i16_via_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i16 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[TMP2]] to i8
; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: ret i32 [[CONV6]]
@@ -428,8 +442,9 @@ entry:
; Same as above but the table is a little off
@tablei128b = internal unnamed_addr constant i128 16018378897745984667142068813250560256, align 16
-define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) {
+define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) !prof !0 {
; CHECK-LABEL: @cttz_i16_via_i128_incorrecttable(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SUB:%.*]] = sub i16 0, [[X:%.*]]
; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], [[SUB]]
@@ -455,12 +470,13 @@ entry:
; Same as ctz1 but the table and load is very large
@ctz7i128.table = internal unnamed_addr constant [32 x i128] [i128 0, i128 1, i128 28, i128 2, i128 29, i128 14, i128 24, i128 3, i128 30, i128 22, i128 20, i128 15, i128 25, i128 17, i128 4, i128 8, i128 31, i128 27, i128 13, i128 23, i128 21, i128 19, i128 16, i128 7, i128 26, i128 12, i128 18, i128 6, i128 11, i128 5, i128 10, i128 9], align 16
-define i128 @ctz1_i128(i32 %x) {
+define i128 @ctz1_i128(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i128
; CHECK-NEXT: ret i128 [[TMP3]]
;
@@ -477,12 +493,13 @@ entry:
; This is roughly the same as ctz1 but using i128.
@table.i128 = internal unnamed_addr constant [128 x i8] c"\00\01e\02tf<\03|ug^R=!\04}yvWoh_5ZSE>0\22\14\05~rzPwmX.pkiI`K6\1Ab[TBMF?'81*#\1C\15\0E\06\7Fds;{]Q xVn4YD/\13qOl-jHJ\19aAL&7)\1B\0Dc:\\\1FU3C\12N,G\18@%(\0C9\1E2\11+\17$\0B\1D\10\16\0A\0F\09\08\07", align 1
-define i32 @src(i128 noundef %x) {
+define i32 @src(i128 noundef %x) !prof !0 {
; CHECK-LABEL: @src(
+; CHECK: !prof [[PROF_0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP3:%.*]] = call i128 @llvm.cttz.i128(i128 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i128 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i128 0, i128 [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i128 0, i128 [[TMP3]], !prof [[PROF_1]]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -498,3 +515,8 @@ entry:
%conv = zext i8 %0 to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 100}
+; CHECK: [[PROF_2]] = !{!"branch_weights", i32 3, i32 5}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
index d2ecb57d94f99..f9c83b14c97a2 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-dereferencing-pointer.ll
@@ -20,13 +20,14 @@
@table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
-define i32 @ctz6(ptr nocapture readonly %b) {
+define i32 @ctz6(ptr nocapture readonly %b) !prof !0 {
; CHECK-LABEL: @ctz6(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[TMP4]]
;
@@ -40,3 +41,7 @@ entry:
%1 = load i32, ptr %arrayidx, align 4
ret i32 %1
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 100}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
index f63badb9f0a91..5f68fc0f2a6d2 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
@@ -20,13 +20,14 @@
@.str = private constant [3 x i8] c"%u\00", align 1
@test.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @test() {
+define i32 @test() !prof !0 {
; CHECK-LABEL: @test(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @x, align 4
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -43,3 +44,7 @@ entry:
%conv = zext i8 %1 to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 100}
\ No newline at end of file
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
index bbdd9b7cef102..734bcd9b65b95 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-zero-element.ll
@@ -3,12 +3,13 @@
@ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
-define i32 @ctz1(i32 %x) {
+define i32 @ctz1(i32 %x) !prof !0 {
; CHECK-LABEL: @ctz1(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[CONV]]...
[truncated]
|
llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
Outdated
Show resolved
Hide resolved
llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-non-argument-value.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
Show resolved
Hide resolved
e2a8987
to
bf000ab
Compare
bf000ab
to
7c7cca3
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
7c7cca3
to
87142eb
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm, just a nit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the 100:1 ratio derived from something other than intuition? It's probably fine, especially since most passes don't make use of exact numerical information, but having a (documented) basis for this would be good, especially if someone wants to follow up in the future.
87142eb
to
1a30c5b
Compare
1a30c5b
to
e814843
Compare
llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
Outdated
Show resolved
Hide resolved
llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
Outdated
Show resolved
Hide resolved
// Attach heuristic branch weigths to the newly 'select' instruction that | ||
// handles the cttz(0) edge case The assumpltion is tht the input to a cttz | ||
// operation is rarely 0, so we add a strong 100-to-1 bias weights to the | ||
// 'false' path. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment mentioning that the 100-1 is arbitrary (or where it comes from if not)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added. Thx!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see where this has been added. Is it "heuristic" in the first line/arbitrarily down below?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, we also have createUnlikelyBranchWeights() as the standard way to indicate an unlikely branch, though the values it uses are a lot more aggressive than 1:100.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should use that then - besides the precedent, it's also self-documenting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, in C++ the [[likely]] attribute produces branch weights of 2000 and 1 (but I agree createUnlikelyBranchWeights() is better should this change in the future).
(EDIT: fix godbolt link)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see where this has been added. Is it "heuristic" in the first line/arbitrarily down below?
Yes, the 100-to-1 ratio is an arbitrary bias weight, just indicating that cttz(0) is a rare case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, we also have createUnlikelyBranchWeights() as the standard way to indicate an unlikely branch, though the values it uses are a lot more aggressive than 1:100.
Yes, createUnlikelyBranchWeights() provides much more aggressive bias weights (1 vs. 1,048,575 [1]).
[1] (1U << 20) - 1: shift 1 left by 20 bits, then subtract 1.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should use that then - besides the precedent, it's also self-documenting.
Sure! Replaced by the self-documenting createUnlikelyBranchWeights()
9005f80
to
0f37ed9
Compare
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz); | ||
|
||
// Attach heuristic branch weigths to the newly 'select' instruction that | ||
// handles the cttz(0) edge case The assumption is tht the input to a cttz |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: s/tht/that
auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0)); | ||
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz); | ||
|
||
// Attach heuristic branch weigths to the newly 'select' instruction that |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: s/weigths/weights
// Attach heuristic branch weigths to the newly 'select' instruction that | ||
// handles the cttz(0) edge case The assumpltion is tht the input to a cttz | ||
// operation is rarely 0, so we add a strong 100-to-1 bias weights to the | ||
// 'false' path. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see where this has been added. Is it "heuristic" in the first line/arbitrarily down below?
45e768f
to
4f63034
Compare
auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz); | ||
|
||
// The true branch of select handles the cttz(0) case, which is extremely | ||
// rare. Use the standard createUnlikelyBranchWeights() helper to provide a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: You can probably remove the second sentence, given that createUnlikelyBranchWeights should be self-documenting.
ed9e4c0
to
95aaabd
Compare
95aaabd
to
56bb387
Compare
… cttz. (#161898) When lowering a `table-based cttz` calculation to the `llvm.cttz` intrinsic, `AggressiveInstCombine` was not attaching profile metadata to the newly generated `select` instruction. This PR adds heuristic branch weights to the `select`. It uses a strong 100-to-1 probability favoring the `cttz` path over the zero-input case. This allows later passes to optimize code layout and branch prediction.
… cttz. (llvm#161898) When lowering a `table-based cttz` calculation to the `llvm.cttz` intrinsic, `AggressiveInstCombine` was not attaching profile metadata to the newly generated `select` instruction. This PR adds heuristic branch weights to the `select`. It uses a strong 100-to-1 probability favoring the `cttz` path over the zero-input case. This allows later passes to optimize code layout and branch prediction.
When lowering a `table-based cttz` calculation to the `llvm.cttz` intrinsic, `AggressiveInstCombine` was not attaching profile metadata to the newly generated `select` instruction.
This PR adds heuristic branch weights to the `select`. It uses a strong 100-to-1 probability favoring the `cttz` path over the zero-input case. This allows later passes to optimize code layout and branch prediction.