[AMDGPU] Report only local per-function resource usage when object linking is enabled#192594
Conversation
This stack of pull requests is managed by sgh. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) ChangesWith object linking the linker aggregates resource usage across TUs via In this mode, skip the per-callsite conservative bumps in Full diff: https://github.com/llvm/llvm-project/pull/192594.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 83012c4cb72f2..e3a85dea934ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCResourceInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -267,6 +268,31 @@ void MCResourceInfo::gatherResourceInfo(
LLVM_DEBUG(dbgs() << "MCResUse: Gathering resource information for "
<< FnSym->getName() << '\n');
+
+ auto SetToLocal = [&](int64_t Value, ResourceInfoKind RIK) {
+ MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
+ Sym->setVariableValue(MCConstantExpr::create(Value, OutContext));
+ };
+
+ // When link-time object linking is enabled, set all resource symbols to
+ // concrete local values.
+ if (AMDGPUTargetMachine::EnableObjectLinking) {
+ LLVM_DEBUG(dbgs() << "MCResUse: object linking enabled, no call-graph "
+ "propagation; emitting local resource values only\n");
+ SetToLocal(FRI.NumVGPR, RIK_NumVGPR);
+ SetToLocal(FRI.NumAGPR, RIK_NumAGPR);
+ SetToLocal(FRI.NumExplicitSGPR, RIK_NumSGPR);
+ SetToLocal(FRI.NumNamedBarrier, RIK_NumNamedBarrier);
+ SetToLocal(FRI.PrivateSegmentSize, RIK_PrivateSegSize);
+ SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+ SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+ SetToLocal(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack);
+ SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+ SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+ return;
+ }
+
LLVM_DEBUG({
if (!FRI.Callees.empty()) {
dbgs() << "MCResUse: Callees:\n";
@@ -347,14 +373,6 @@ void MCResourceInfo::gatherResourceInfo(
Sym->setVariableValue(localConstExpr);
}
- auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
- MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
- LLVM_DEBUG(
- dbgs() << "MCResUse: " << Sym->getName() << ": Adding " << LocalValue
- << ", no further propagation as indirect callee found within\n");
- Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
- };
-
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 5c1f59636446c..3cb063ef8e962 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -102,6 +102,10 @@ class MCResourceInfo {
/// transitive maximum or accumulative. For example, if A calls B and B's VGPR
/// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
/// functions with indirect calls should be assigned the module level maximum.
+ ///
+ /// When link-time object linking is enabled, skip all call-transitive
+ /// propagation and emit concrete per-function values for every resource
+ /// symbol. Cross-TU aggregation is then the linker's responsibility.
void gatherResourceInfo(
const MachineFunction &MF,
const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &FRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 4e664e084fb88..51bebefed5aa7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -272,6 +273,15 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.Callees.push_back(Callee);
bool IsIndirect = !Callee || Callee->isDeclaration();
+ Info.HasIndirectCall |= IsIndirect;
+
+ // In object linking mode the linker has the full cross-TU view. It
+ // propagates resource usage across both direct calls to external
+ // declarations and true indirect calls. Skip the compile-time
+ // conservative assumptions so that the locally emitted metadata
+ // describes this function's own usage only.
+ if (AMDGPUTargetMachine::EnableObjectLinking)
+ continue;
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
@@ -301,7 +311,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
- Info.HasIndirectCall = true;
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-indirect.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-indirect.ll
index 9e18b79e20814..fb23f4bd8e933 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-indirect.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @my_kernel() {
; ASM-DAG: .amdgpu_typeid "vi"
; ASM-DAG: .end_amdgpu_info
; ASM-DAG: .amdgpu_info caller
-; ASM-DAG: .amdgpu_flags 14
+; ASM-DAG: .amdgpu_flags 2
; ASM-DAG: .amdgpu_num_vgpr {{[0-9]+}}
; ASM-DAG: .amdgpu_indirect_call "vi"
; ASM-DAG: .end_amdgpu_info
diff --git a/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
new file mode 100644
index 0000000000000..a25f74c46e954
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=OL
+
+declare void @extern_callee()
+
+define void @calls_extern() {
+ call void @extern_callee()
+ ret void
+}
+
+define void @calls_indirect(ptr %fptr) {
+ call void %fptr()
+ ret void
+}
+
+define void @calls_local() {
+ ret void
+}
+
+define amdgpu_kernel void @my_kernel(ptr %fptr) {
+ call void @calls_extern()
+ call void @calls_indirect(ptr %fptr)
+ call void @calls_local()
+ ret void
+}
+
+; COM: Default mode: direct-to-extern triggers the conservative "unknown
+; COM: callee" path. Register/stack-size symbols include the module-level
+; COM: sinks; boolean flags are all forced to 1; HasIndirectCall is set too
+; COM: (IsIndirect covers calls to declarations).
+; DEFAULT: .set .Lcalls_extern.num_vgpr, max({{[0-9]+}}, amdgpu.max_num_vgpr)
+; DEFAULT: .set .Lcalls_extern.num_agpr, max({{[0-9]+}}, amdgpu.max_num_agpr)
+; DEFAULT: .set .Lcalls_extern.numbered_sgpr, max({{[0-9]+}}, amdgpu.max_num_sgpr)
+; DEFAULT: .set .Lcalls_extern.num_named_barrier, max({{[0-9]+}}, amdgpu.max_num_named_barrier)
+; DEFAULT: .set .Lcalls_extern.uses_vcc, 1
+; DEFAULT: .set .Lcalls_extern.uses_flat_scratch, 1
+; DEFAULT: .set .Lcalls_extern.has_dyn_sized_stack, 1
+; DEFAULT: .set .Lcalls_extern.has_recursion, 1
+; DEFAULT: .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: Object linking: the same function reports only its own local usage.
+; COM: The sinks drop out of the register/stack-size expressions and the
+; COM: pessimized boolean flags collapse to the true local values (UsesVCC is
+; COM: still 1 here because the call-site lowering on gfx900 genuinely uses
+; COM: VCC).
+; OL: .set .Lcalls_extern.num_vgpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.num_agpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.numbered_sgpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.num_named_barrier, {{[0-9]+}}
+; OL: .set .Lcalls_extern.uses_vcc, 1
+; OL: .set .Lcalls_extern.uses_flat_scratch, 0
+; OL: .set .Lcalls_extern.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_extern.has_recursion, 0
+; OL: .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: True indirect call: identical contrast to the direct-to-extern case.
+; COM: In default mode all the flags are pessimized; with object linking only
+; COM: HasIndirectCall is preserved (the linker sees the call site's typeid
+; COM: and address-taken set and handles propagation).
+; DEFAULT: .set .Lcalls_indirect.uses_vcc, 1
+; DEFAULT: .set .Lcalls_indirect.uses_flat_scratch, 1
+; DEFAULT: .set .Lcalls_indirect.has_dyn_sized_stack, 1
+; DEFAULT: .set .Lcalls_indirect.has_recursion, 1
+; DEFAULT: .set .Lcalls_indirect.has_indirect_call, 1
+
+; OL: .set .Lcalls_indirect.uses_vcc, 1
+; OL: .set .Lcalls_indirect.uses_flat_scratch, 0
+; OL: .set .Lcalls_indirect.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_indirect.has_recursion, 0
+; OL: .set .Lcalls_indirect.has_indirect_call, 1
+
+; COM: Baseline: a function that makes no calls outside itself reports the
+; COM: same all-zero local flags in both modes.
+; DEFAULT: .set .Lcalls_local.uses_vcc, 0
+; DEFAULT: .set .Lcalls_local.uses_flat_scratch, 0
+; DEFAULT: .set .Lcalls_local.has_dyn_sized_stack, 0
+; DEFAULT: .set .Lcalls_local.has_recursion, 0
+; DEFAULT: .set .Lcalls_local.has_indirect_call, 0
+
+; OL: .set .Lcalls_local.uses_vcc, 0
+; OL: .set .Lcalls_local.uses_flat_scratch, 0
+; OL: .set .Lcalls_local.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_local.has_recursion, 0
+; OL: .set .Lcalls_local.has_indirect_call, 0
+
+; COM: Kernel side of the contrast. Default mode emits call-graph-propagation
+; COM: expressions (max()/or() over every callee's symbols) so the kernel
+; COM: picks up its callees' pessimized values; object linking emits concrete
+; COM: literals and leaves cross-TU aggregation to the linker.
+; DEFAULT: .set .Lmy_kernel.num_vgpr, max({{[0-9]+}}, .Lcalls_extern.num_vgpr, .Lcalls_indirect.num_vgpr, .Lcalls_local.num_vgpr)
+; DEFAULT: .set .Lmy_kernel.private_seg_size, {{[0-9]+}}+max(.Lcalls_extern.private_seg_size, .Lcalls_indirect.private_seg_size, .Lcalls_local.private_seg_size)
+; DEFAULT: .set .Lmy_kernel.uses_vcc, or({{[0-9]+}}, .Lcalls_extern.uses_vcc, .Lcalls_indirect.uses_vcc, .Lcalls_local.uses_vcc)
+; DEFAULT: .set .Lmy_kernel.uses_flat_scratch, or({{[0-9]+}}, .Lcalls_extern.uses_flat_scratch, .Lcalls_indirect.uses_flat_scratch, .Lcalls_local.uses_flat_scratch)
+; DEFAULT: .set .Lmy_kernel.has_dyn_sized_stack, or({{[0-9]+}}, .Lcalls_extern.has_dyn_sized_stack, .Lcalls_indirect.has_dyn_sized_stack, .Lcalls_local.has_dyn_sized_stack)
+; DEFAULT: .set .Lmy_kernel.has_recursion, or({{[0-9]+}}, .Lcalls_extern.has_recursion, .Lcalls_indirect.has_recursion, .Lcalls_local.has_recursion)
+; DEFAULT: .set .Lmy_kernel.has_indirect_call, or({{[0-9]+}}, .Lcalls_extern.has_indirect_call, .Lcalls_indirect.has_indirect_call, .Lcalls_local.has_indirect_call)
+
+; OL: .set .Lmy_kernel.num_vgpr, {{[0-9]+}}
+; OL: .set .Lmy_kernel.private_seg_size, {{[0-9]+}}
+; OL: .set .Lmy_kernel.uses_vcc, {{[01]}}
+; OL: .set .Lmy_kernel.uses_flat_scratch, {{[01]}}
+; OL: .set .Lmy_kernel.has_dyn_sized_stack, 0
+; OL: .set .Lmy_kernel.has_recursion, 0
+; OL: .set .Lmy_kernel.has_indirect_call, 0
|
|
Are the symbols something you actively use in obj linking? Am I interpreting it correctly that is is just a change that elides the propagation through symbols? |
2dbe45a to
70f9ad8
Compare
9854bf4 to
2ebfe91
Compare
Yes. I thought about not emitting the symbols. They are not really actively used by the linker, but they might be used by kernel descriptor. I also considered deferring KD emission to the linker, but apparently KD has more fields that the linker can't really fill in, so we have to emit it here and then patch things up. However, we might be able to emit some placeholder here for those fields that will be patched by the linker at link time. WDYT? |
Yeah, and the MsgPack stuff possibly
In my mental model of the ideal world, the KD emission would move to linker but I also acknowledge that that might require a lot more metadata to be emitted for use in the linker (which is what you're also seeing, I think).
I think patching will have to happen for cross-TU kernel descriptors to have sensible propagated metadata, yeah To add some context on why I asked: I expected that the metadata values emission into obj would be the ones directly coming from AMDGPUResourceUsageAnalysis pass rather than the symbols so these symbols changing felt out of place for the obj linking PRs |
70f9ad8 to
2ac38e9
Compare
2ebfe91 to
ddd92c7
Compare
2ac38e9 to
fcdd649
Compare
e030fe2 to
06d91cf
Compare
|
This PR is separated from #192384, since it is independent from that. I have some ideas that we might just want to use these MC symbols for resource usage propagation in the linker, such that the Even if we don't want to emit these symbols in the future, not defining them would be a separate PR. The whole purpose of this PR is to stop propagation if object linking is enabled. |
arsenm
left a comment
There was a problem hiding this comment.
Probably should just do this unconditionally?
We can't, because these symbols are used by kernel descriptor and HSA metadata. If linker doesn't patch them later, we will have wrong numbers. |
rovka
left a comment
There was a problem hiding this comment.
LGTM with microscopic nits.
…nking is enabled With object linking the linker aggregates resource usage across TUs, so compile-time pessimism and call-graph propagation duplicate the linker's work or pollute its inputs. In this mode, skip the per-callsite conservative bumps in `AMDGPUResourceUsageAnalysis` and assign each resource symbol in `AMDGPUMCResourceInfo` a concrete local constant instead of building call-graph `max`/`or` expressions.
06d91cf to
d534b56
Compare
The linker should always do this |
…nking is enabled (llvm#192594) With object linking the linker aggregates resource usage across TUs, so compile-time pessimism and call-graph propagation duplicate the linker's work or pollute its inputs. In this mode, skip the per-callsite conservative bumps in `AMDGPUResourceUsageAnalysis` and assign each resource symbol in `AMDGPUMCResourceInfo` a concrete local constant instead of building call-graph max/or expressions.
Not really. In non-RDC mode, the linker is not expected to be invoked at all. The backend can just emit an executable. |
…nking is enabled (llvm#192594) With object linking the linker aggregates resource usage across TUs, so compile-time pessimism and call-graph propagation duplicate the linker's work or pollute its inputs. In this mode, skip the per-callsite conservative bumps in `AMDGPUResourceUsageAnalysis` and assign each resource symbol in `AMDGPUMCResourceInfo` a concrete local constant instead of building call-graph max/or expressions.
With object linking the linker aggregates resource usage across TUs, so compile-time pessimism and call-graph propagation duplicate the linker's work or pollute its inputs.
In this mode, skip the per-callsite conservative bumps in
AMDGPUResourceUsageAnalysisand assign each resource symbol inAMDGPUMCResourceInfoa concrete local constant instead of building call-graph max/or expressions.