-
Notifications
You must be signed in to change notification settings - Fork 15k
[AMDGPU] make AMDGPUUniformIntrinsicCombine a function pass #165265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] make AMDGPUUniformIntrinsicCombine a function pass #165265
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Pankaj Dwivedi (PankajDwivedi-25) ChangesThere has been an issue(using function analysis inside the module pass in OPM) integrating this pass into the LLC pipeline, which currently lacks NPM support. I tried finding a way to get the per-function analysis, but it seems that in OPM, we don't have that option. So the best approach would be to make it a function pass. Ref: #116953 Full diff: https://github.com/llvm/llvm-project/pull/165265.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ce2b4a5f6f2e9..cd8b2495a4250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -562,9 +562,13 @@ class AMDGPURewriteAGPRCopyMFMAPass
void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &);
extern char &AMDGPURewriteAGPRCopyMFMALegacyID;
+void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &);
+extern char &AMDGPUUniformIntrinsicCombineLegacyPassID;
+FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass();
+
struct AMDGPUUniformIntrinsicCombinePass
: public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index a6074eaf78fd0..bf6f1a9dbf576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
-MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
@@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a200de4e0..4cfbee9c9fb7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -618,6 +618,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -884,9 +885,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
-
- if (EnableUniformIntrinsicCombine)
- PM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerPeepholeEPCallback(
@@ -897,6 +895,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
+
+ if (EnableUniformIntrinsicCombine)
+ FPM.addPass(AMDGPUUniformIntrinsicCombinePass());
});
PB.registerCGSCCOptimizerLateEPCallback(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 50c78d8c67251..65e6ed9d1d428 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -16,12 +16,6 @@
/// uniformity. And every instruction that's downstream and cares about dynamic
/// uniformity must be convergent (and isel will introduce v_readfirstlane for
/// them if their operands can't be proven statically uniform).
-///
-/// This pass is implemented as a ModulePass because intrinsic declarations
-/// exist at the module scope, allowing us to skip processing entirely if no
-/// declarations are present and to traverse their user lists directly when
-/// they are. A FunctionPass would instead require scanning every instruction
-/// in every function to find relevant intrinsics, which is far less efficient.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
Tracker[NotOp] = true; // NOT preserves uniformity
LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
ICmp->replaceAllUsesWith(NotOp);
- ICmp->eraseFromParent();
Changed = true;
} else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
// Case: (icmp ne %ballot, 0) -> %ballot_arg
LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
<< *Src << '\n');
ICmp->replaceAllUsesWith(Src);
- ICmp->eraseFromParent();
Changed = true;
}
}
@@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return false;
}
-/// Iterates over intrinsic declarations in the module to optimize their uses.
-static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+/// Iterates over intrinsic calls in the Function to optimize.
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
bool IsChanged = false;
ValueMap<const Value *, bool> Tracker;
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- for (Function &F : M) {
- switch (F.getIntrinsicID()) {
+ for (Instruction &I : make_early_inc_range(instructions(F))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+
+ switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
@@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
default:
continue;
}
-
- for (User *U : make_early_inc_range(F.users())) {
- auto *II = cast<IntrinsicInst>(U);
- Function *ParentF = II->getFunction();
- const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
- IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
- }
+ IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
}
PreservedAnalyses
-AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runUniformIntrinsicCombine(M, AM))
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
+ if (!runUniformIntrinsicCombine(F, UI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<UniformityInfoAnalysis>();
return PA;
}
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
+ initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+} // namespace
+
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+ AMDGPUUniformIntrinsicCombineLegacy::ID;
+
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ const UniformityInfo &UI =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ return runUniformIntrinsicCombine(F, UI);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE,
+ "AMDGPU Uniform Intrinsic Combine", false, false)
+
+FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+ return new AMDGPUUniformIntrinsicCombineLegacy();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index 6c4f504f3456c..33ce278028bba 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1)
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
@@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
@@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index aa11574517520..a3e42e564376c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
+; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
@@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]])
+; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, except for the nit about eraseFromParent(). It will be good to keep the erase if that is legal. Also please wait a day for @arsenm to comment.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/94/builds/12173 Here is the relevant piece of the build log for the reference |
) There has been an issue(using function analysis inside the module pass in OPM) integrating this pass into the LLC pipeline, which currently lacks NPM support. I tried finding a way to get the per-function analysis, but it seems that in OPM, we don't have that option. So the best approach would be to make it a function pass. Ref: llvm#116953
There has been an issue(using function analysis inside the module pass in OPM) integrating this pass into the LLC pipeline, which currently lacks NPM support. I tried finding a way to get the per-function analysis, but it seems that in OPM, we don't have that option.
So the best approach would be to make it a function pass.
Ref: #116953