diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp index 83012c4cb72f2..e3a85dea934ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCResourceInfo.h" +#include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" @@ -267,6 +268,31 @@ void MCResourceInfo::gatherResourceInfo( LLVM_DEBUG(dbgs() << "MCResUse: Gathering resource information for " << FnSym->getName() << '\n'); + + auto SetToLocal = [&](int64_t Value, ResourceInfoKind RIK) { + MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext); + Sym->setVariableValue(MCConstantExpr::create(Value, OutContext)); + }; + + // When link-time object linking is enabled, set all resource symbols to + // concrete local values. + if (AMDGPUTargetMachine::EnableObjectLinking) { + LLVM_DEBUG(dbgs() << "MCResUse: object linking enabled, no call-graph " + "propagation; emitting local resource values only\n"); + SetToLocal(FRI.NumVGPR, RIK_NumVGPR); + SetToLocal(FRI.NumAGPR, RIK_NumAGPR); + SetToLocal(FRI.NumExplicitSGPR, RIK_NumSGPR); + SetToLocal(FRI.NumNamedBarrier, RIK_NumNamedBarrier); + SetToLocal(FRI.PrivateSegmentSize, RIK_PrivateSegSize); + SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC); + SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch); + SetToLocal(FRI.HasDynamicallySizedStack, + ResourceInfoKind::RIK_HasDynSizedStack); + SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion); + SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall); + return; + } + LLVM_DEBUG({ if (!FRI.Callees.empty()) { dbgs() << "MCResUse: Callees:\n"; @@ -347,14 +373,6 @@ void MCResourceInfo::gatherResourceInfo( Sym->setVariableValue(localConstExpr); } - auto SetToLocal = 
[&](int64_t LocalValue, ResourceInfoKind RIK) { - MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext); - LLVM_DEBUG( - dbgs() << "MCResUse: " << Sym->getName() << ": Adding " << LocalValue - << ", no further propagation as indirect callee found within\n"); - Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext)); - }; - if (!FRI.HasIndirectCall) { assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC, AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h index 5c1f59636446c..3cb063ef8e962 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h @@ -102,6 +102,10 @@ class MCResourceInfo { /// transitive maximum or accumulative. For example, if A calls B and B's VGPR /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore, /// functions with indirect calls should be assigned the module level maximum. + /// + /// When link-time object linking is enabled, skip all call-transitive + /// propagation and emit concrete per-function values for every resource + /// symbol. Cross-TU aggregation is then the linker's responsibility. 
void gatherResourceInfo( const MachineFunction &MF, const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &FRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 4e664e084fb88..51bebefed5aa7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -17,6 +17,7 @@ #include "AMDGPUResourceUsageAnalysis.h" #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -272,6 +273,15 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.Callees.push_back(Callee); bool IsIndirect = !Callee || Callee->isDeclaration(); + Info.HasIndirectCall |= IsIndirect; + + // In object linking mode the linker has the full cross-TU view. It + // propagates resource usage across both direct calls to external + // declarations and true indirect calls. Skip the compile-time + // conservative assumptions so that the locally emitted metadata + // describes this function's own usage only. 
+ if (AMDGPUTargetMachine::EnableObjectLinking) + continue; // FIXME: Call site could have norecurse on it if (!Callee || !Callee->doesNotRecurse()) { @@ -301,7 +311,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = true; Info.UsesFlatScratch = ST.hasFlatAddressSpace(); Info.HasDynamicallySizedStack = true; - Info.HasIndirectCall = true; } } } diff --git a/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll new file mode 100644 index 0000000000000..95214bcf7c06d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll @@ -0,0 +1,109 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s --check-prefix=DEFAULT +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=OL + +declare void @extern_callee() + +define void @calls_extern() { + call void @extern_callee() + ret void +} + +define void @calls_indirect(ptr %fptr) { + call void %fptr() + ret void +} + +define void @calls_local() { + ret void +} + +define amdgpu_kernel void @my_kernel(ptr %fptr) { + call void @calls_extern() + call void @calls_indirect(ptr %fptr) + call void @calls_local() + ret void +} + +; COM: Default mode: direct-to-extern triggers the conservative "unknown +; COM: callee" path. Register/stack-size symbols include the module-level +; COM: sinks; boolean flags are all forced to 1; HasIndirectCall is set too +; COM: (IsIndirect covers calls to declarations). 
+; DEFAULT: .set .Lcalls_extern.num_vgpr, max({{[0-9]+}}, amdgpu.max_num_vgpr) +; DEFAULT: .set .Lcalls_extern.num_agpr, max({{[0-9]+}}, amdgpu.max_num_agpr) +; DEFAULT: .set .Lcalls_extern.numbered_sgpr, max({{[0-9]+}}, amdgpu.max_num_sgpr) +; DEFAULT: .set .Lcalls_extern.num_named_barrier, max({{[0-9]+}}, amdgpu.max_num_named_barrier) +; DEFAULT: .set .Lcalls_extern.uses_vcc, 1 +; DEFAULT: .set .Lcalls_extern.uses_flat_scratch, 1 +; DEFAULT: .set .Lcalls_extern.has_dyn_sized_stack, 1 +; DEFAULT: .set .Lcalls_extern.has_recursion, 1 +; DEFAULT: .set .Lcalls_extern.has_indirect_call, 1 + +; COM: Object linking: the same function reports only its own local usage. +; COM: The sinks drop out of the register/stack-size expressions and the +; COM: pessimized boolean flags collapse to the true local values (UsesVCC is +; COM: still 1 here because the call-site lowering on gfx900 genuinely uses +; COM: VCC). +; OL: .set .Lcalls_extern.num_vgpr, {{[0-9]+}} +; OL: .set .Lcalls_extern.num_agpr, {{[0-9]+}} +; OL: .set .Lcalls_extern.numbered_sgpr, {{[0-9]+}} +; OL: .set .Lcalls_extern.num_named_barrier, {{[0-9]+}} +; OL: .set .Lcalls_extern.uses_vcc, 1 +; OL: .set .Lcalls_extern.uses_flat_scratch, 0 +; OL: .set .Lcalls_extern.has_dyn_sized_stack, 0 +; OL: .set .Lcalls_extern.has_recursion, 0 +; OL: .set .Lcalls_extern.has_indirect_call, 1 + +; COM: True indirect call: same DEFAULT-vs-OL behavior as the direct-to-extern +; COM: case above. In DEFAULT mode all the flags are pessimized; with object +; COM: linking only HasIndirectCall is preserved (the linker sees the call +; COM: site's typeid and address-taken set and handles propagation). 
+; DEFAULT: .set .Lcalls_indirect.uses_vcc, 1 +; DEFAULT: .set .Lcalls_indirect.uses_flat_scratch, 1 +; DEFAULT: .set .Lcalls_indirect.has_dyn_sized_stack, 1 +; DEFAULT: .set .Lcalls_indirect.has_recursion, 1 +; DEFAULT: .set .Lcalls_indirect.has_indirect_call, 1 + +; OL: .set .Lcalls_indirect.uses_vcc, 1 +; OL: .set .Lcalls_indirect.uses_flat_scratch, 0 +; OL: .set .Lcalls_indirect.has_dyn_sized_stack, 0 +; OL: .set .Lcalls_indirect.has_recursion, 0 +; OL: .set .Lcalls_indirect.has_indirect_call, 1 + +; COM: Baseline: a function that makes no calls at all reports the same +; COM: all-zero local flags in both modes. +; DEFAULT: .set .Lcalls_local.uses_vcc, 0 +; DEFAULT: .set .Lcalls_local.uses_flat_scratch, 0 +; DEFAULT: .set .Lcalls_local.has_dyn_sized_stack, 0 +; DEFAULT: .set .Lcalls_local.has_recursion, 0 +; DEFAULT: .set .Lcalls_local.has_indirect_call, 0 + +; OL: .set .Lcalls_local.uses_vcc, 0 +; OL: .set .Lcalls_local.uses_flat_scratch, 0 +; OL: .set .Lcalls_local.has_dyn_sized_stack, 0 +; OL: .set .Lcalls_local.has_recursion, 0 +; OL: .set .Lcalls_local.has_indirect_call, 0 + +; COM: Kernel side of the DEFAULT-vs-OL comparison. DEFAULT mode emits +; COM: call-graph-propagation expressions (max()/or() over every callee's +; COM: symbols) so the kernel picks up its callees' pessimized values; object +; COM: linking emits concrete literals and leaves cross-TU aggregation to the +; COM: linker. 
+; DEFAULT: .set .Lmy_kernel.num_vgpr, max({{[0-9]+}}, .Lcalls_extern.num_vgpr, .Lcalls_indirect.num_vgpr, .Lcalls_local.num_vgpr) +; DEFAULT: .set .Lmy_kernel.num_agpr, max({{[0-9]+}}, .Lcalls_extern.num_agpr, .Lcalls_indirect.num_agpr, .Lcalls_local.num_agpr) +; DEFAULT: .set .Lmy_kernel.num_named_barrier, max({{[0-9]+}}, .Lcalls_extern.num_named_barrier, .Lcalls_indirect.num_named_barrier, .Lcalls_local.num_named_barrier) +; DEFAULT: .set .Lmy_kernel.private_seg_size, {{[0-9]+}}+max(.Lcalls_extern.private_seg_size, .Lcalls_indirect.private_seg_size, .Lcalls_local.private_seg_size) +; DEFAULT: .set .Lmy_kernel.uses_vcc, or({{[0-9]+}}, .Lcalls_extern.uses_vcc, .Lcalls_indirect.uses_vcc, .Lcalls_local.uses_vcc) +; DEFAULT: .set .Lmy_kernel.uses_flat_scratch, or({{[0-9]+}}, .Lcalls_extern.uses_flat_scratch, .Lcalls_indirect.uses_flat_scratch, .Lcalls_local.uses_flat_scratch) +; DEFAULT: .set .Lmy_kernel.has_dyn_sized_stack, or({{[0-9]+}}, .Lcalls_extern.has_dyn_sized_stack, .Lcalls_indirect.has_dyn_sized_stack, .Lcalls_local.has_dyn_sized_stack) +; DEFAULT: .set .Lmy_kernel.has_recursion, or({{[0-9]+}}, .Lcalls_extern.has_recursion, .Lcalls_indirect.has_recursion, .Lcalls_local.has_recursion) +; DEFAULT: .set .Lmy_kernel.has_indirect_call, or({{[0-9]+}}, .Lcalls_extern.has_indirect_call, .Lcalls_indirect.has_indirect_call, .Lcalls_local.has_indirect_call) + +; OL: .set .Lmy_kernel.num_vgpr, {{[0-9]+}} +; OL: .set .Lmy_kernel.num_agpr, {{[0-9]+}} +; OL: .set .Lmy_kernel.num_named_barrier, {{[0-9]+}} +; OL: .set .Lmy_kernel.private_seg_size, {{[0-9]+}} +; OL: .set .Lmy_kernel.uses_vcc, {{[01]}} +; OL: .set .Lmy_kernel.uses_flat_scratch, {{[01]}} +; OL: .set .Lmy_kernel.has_dyn_sized_stack, 0 +; OL: .set .Lmy_kernel.has_recursion, 0 +; OL: .set .Lmy_kernel.has_indirect_call, 0