[AMDGPU] Set amdgpu-memory-bound if a basic block has dense global memory access

AMDGPUPerfHintAnalysis doesn't set the memory-bound attribute when
FuncInfo::InstCost outweighs MemInstCost, even if the function has a
basic block with relatively high global memory access. GCNSchedStrategy
can then revert the optimal schedule in favour of occupancy, which
seems to degrade performance for some kernels. This change introduces
the HasDenseGlobalMemAcc metric into the heuristic, making the analysis
more conservative in these cases.

This fixes SWDEV-334259/SWDEV-343932

Differential Revision: https://reviews.llvm.org/D129759
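
A hypothetical IR sketch (not part of this commit or its tests) of the kind of
block the new heuristic targets: every value loaded from global memory
(addrspace(1)) is consumed in the same basic block. Each 128-bit load counts
divideCeil(128, 32) = 4 towards the per-block total, so 8 * 100 / 6
instructions is about 133, well above the 50% threshold (the ratio mixes dword
counts with instruction counts, so it can exceed 100), and the kernel would be
flagged as memory bound.

; Illustrative only; the kernel, names and types are made up for this sketch.
define amdgpu_kernel void @dense_global_loads(<4 x float> addrspace(1)* %in,
                                              <4 x float> addrspace(1)* %out) {
entry:
  %p1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i64 1
  %v0 = load <4 x float>, <4 x float> addrspace(1)* %in   ; used below
  %v1 = load <4 x float>, <4 x float> addrspace(1)* %p1   ; used below
  %sum = fadd <4 x float> %v0, %v1
  store <4 x float> %sum, <4 x float> addrspace(1)* %out
  ret void
}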
abinavpp committed Jul 19, 2022
1 parent 4baf8f0 commit 9fa425c
Showing 3 changed files with 45 additions and 2 deletions.
39 changes: 39 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -116,6 +116,7 @@ struct AMDGPUPerfHint {

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};

static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,16 +197,39 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  return false;
}

// Returns true if the global load `I` is used in its own basic block.
bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
  const auto *Ld = dyn_cast<LoadInst>(&I);
  if (!Ld)
    return false;
  if (!isGlobalAddr(Ld->getPointerOperand()))
    return false;

  for (const User *Usr : Ld->users()) {
    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
      if (UsrInst->getParent() == I.getParent())
        return true;
    }
  }

  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
@@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
        ++FI.InstCost;
      }
    }

    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
@@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Reverting optimal scheduling in favour of occupancy with basic block(s)
  // having dense global memory access can potentially hurt performance.
  if (FI.HasDenseGlobalMemAcc)
    return true;

  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
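
A worked example of the new control flow (the numbers are made up): with
MemInstCost = 10 and InstCost = 100, the ratio check gives 10 * 100 / 100 = 10,
which would normally stay under MemBoundThresh and leave the function
unmarked; but if any basic block tripped the >50% dense-global-access check
during visit(), HasDenseGlobalMemAcc is already true and isMemBound returns
true before the ratio is ever computed.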

6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@ struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
    unsigned InstCost;
    unsigned IAMInstCost; // Indirect access memory instruction count
    unsigned LSMInstCost; // Large stride memory instruction count
    FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
    bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
                               // high global memory access
    FuncInfo()
        : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
          HasDenseGlobalMemAcc(false) {}
  };

  typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -20,7 +20,7 @@ bb:
}

; GCN-LABEL: {{^}}test_membound_1:
; GCN: MemoryBound: 0
; GCN: MemoryBound: 1
define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0,
                                           <2 x double> addrspace(1)* nocapture %ptr.1,
                                           <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
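A hedged reading of this test update (the full body of test_membound_1 is not
shown here): the kernel's block is dominated by <2 x double> global loads
whose results are used in the same block; each such load contributes
divideCeil(128, 32) = 4 to the per-block total, so the block's density exceeds
the 50% threshold, HasDenseGlobalMemAcc is set, and the MemoryBound value
checked by the test flips from 0 to 1.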
