
Commit 04ae04a

AMDGPU: share LDS budget logic and add experimental LDS buffering pass
Add AMDGPULDSBuffering pass to buffer per-thread global memory accesses through LDS. The pass transforms load-store pairs on the same global pointer into a pair of memcpy operations through LDS (global->LDS and LDS->global). The main purpose is to alleviate global memory contention and cache thrashing when the same global pointer is used for both the load and the store. This pass was inspired by the finding that some rocrand performance tests show better performance when global memory is buffered through LDS instead of being loaded into and stored back from registers directly.

Extract a reusable LDS budget computation helper (Utils/AMDGPULDSUtils) and refactor AMDGPUPromoteAlloca to use it. This centralizes the estimation of LDS usage and limits, including extern dynamic shared memory and local-address-space arguments, and ties the limits to occupancy tiers consistently across passes. AMDGPULDSBuffering is gated by the same LDS budget, with per-candidate accounting, so that available LDS is not exceeded when multiple candidates exist.

The optimization is experimental and must be enabled via the -amdgpu-enable-lds-buffering flag. It may be turned on by default later if better heuristics are developed.
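For illustration, a minimal hand-written IR sketch of the targeted shape and its rewrite (not taken from this patch's tests; the kernel name and the 256-lane workgroup size are assumptions, and the thread id is simplified to the X workitem id, whereas the pass derives a linear 3-D thread id from the dispatch packet):

  ; Before: a load from global memory whose only use is a store back to the
  ; same pointer. The 16-byte alignment satisfies the pass's minimum.
  define amdgpu_kernel void @roundtrip(ptr addrspace(1) %p) {
    %v = load <4 x float>, ptr addrspace(1) %p, align 16
    store <4 x float> %v, ptr addrspace(1) %p, align 16
    ret void
  }

  ; After (conceptually): the value is staged through a per-thread LDS slot.
  @roundtrip.ldsbuf = internal unnamed_addr addrspace(3) global [256 x <4 x float>] poison, align 16

  define amdgpu_kernel void @roundtrip(ptr addrspace(1) %p) {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %slot = getelementptr inbounds [256 x <4 x float>], ptr addrspace(3) @roundtrip.ldsbuf, i32 0, i32 %tid
    call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) %slot, ptr addrspace(1) %p, i64 16, i1 false)
    call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %p, ptr addrspace(3) %slot, i64 16, i1 false)
    ret void
  }

  declare i32 @llvm.amdgcn.workitem.id.x()
  declare void @llvm.memcpy.p3.p1.i64(ptr addrspace(3), ptr addrspace(1), i64, i1)
  declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1), ptr addrspace(3), i64, i1)

Staging the value through LDS shortens its live range and frees VGPRs, at the cost of WorkGroupSize * sizeof(element) bytes of LDS per accepted candidate.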
1 parent 50faea2 commit 04ae04a

File tree

10 files changed: +590 -114 lines changed


llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 15 additions & 0 deletions
@@ -270,6 +270,21 @@ struct AMDGPUPromoteAllocaToVectorPass
   TargetMachine &TM;
 };
 
+// Buffer selected per-thread global memory through LDS to improve
+// performance in memory-bound kernels. This runs late and is separate
+// from alloca promotion.
+struct AMDGPULDSBufferingPass : PassInfoMixin<AMDGPULDSBufferingPass> {
+  AMDGPULDSBufferingPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
+// Legacy PM wrapper for LDS buffering.
+FunctionPass *createAMDGPULDSBufferingLegacyPass();
+void initializeAMDGPULDSBufferingLegacyPass(PassRegistry &);
+
 struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
   AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
       : TM(TM), ScanImpl(ScanImpl) {}
llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp

Lines changed: 340 additions & 0 deletions
@@ -0,0 +1,340 @@
+//===-- AMDGPULDSBuffering.cpp - Per-thread LDS buffering ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass buffers per-thread global memory accesses through LDS
+// (addrspace(3)) to improve performance in memory-bound kernels. The main
+// purpose is to alleviate global memory contention and cache thrashing when
+// the same global pointer is used for both load and store operations.
+//
+// The pass runs late in the pipeline, after SROA and AMDGPUPromoteAlloca,
+// using only leftover LDS budget to avoid interfering with other LDS
+// optimizations. It respects the same LDS budget constraints as
+// AMDGPUPromoteAlloca, ensuring that LDS usage remains within occupancy
+// tier limits.
+//
+// The current implementation handles the simplest pattern: a load from
+// global memory whose only use is a store back to the same pointer. This
+// pattern is transformed into a pair of memcpy operations (global->LDS and
+// LDS->global), effectively moving the value through LDS instead of
+// accessing global memory directly.
+//
+// This pass was inspired by the finding that some rocrand performance tests
+// show better performance when global memory is buffered through LDS
+// instead of being loaded/stored to registers directly. This optimization
+// is experimental and must be enabled via the -amdgpu-enable-lds-buffering
+// flag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-lds-buffering"
+
+using namespace llvm;
+
+static cl::opt<unsigned> LDSBufferingMaxBytes(
+    "amdgpu-lds-buffering-max-bytes",
+    cl::desc("Max byte size for LDS buffering candidates"), cl::init(64));
+
+namespace {
+
+class AMDGPULDSBufferingImpl {
+  const TargetMachine &TM;
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+  bool IsAMDGCN = false;
+  bool IsAMDHSA = false;
+
+public:
+  AMDGPULDSBufferingImpl(const TargetMachine &TM) : TM(TM) {}
+
+  bool run(Function &F) {
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Visit function: " << F.getName()
+                      << '\n');
+    const Triple &TT = TM.getTargetTriple();
+    if (!TT.isAMDGCN())
+      return false;
+    IsAMDGCN = true;
+    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      return false;
+
+    Mod = F.getParent();
+    DL = &Mod->getDataLayout();
+
+    auto Budget = computeLDSBudget(F, TM);
+    if (!Budget.promotable)
+      return false;
+    uint32_t LocalUsage = Budget.currentUsage;
+    uint32_t LocalLimit = Budget.limit;
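+
+    // Note: getFlatWorkGroupSizes(F).second is the maximum flat workgroup
+    // size the kernel may be launched with, so the per-thread LDS slots
+    // created below are sized for the largest possible launch.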
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+
+    bool Changed = false;
+    unsigned NumTransformed = 0;
+
+    // Minimal pattern: a load from AS(1) whose only use is a store back to
+    // the exact same pointer later. Replace it with a global<->LDS memcpy
+    // pair to shorten the live range and free VGPRs.
+    SmallVector<Instruction *> ToErase;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : llvm::make_early_inc_range(BB)) {
+        auto *LI = dyn_cast<LoadInst>(&I);
+        if (!LI || !LI->isSimple())
+          continue;
+
+        Type *ValTy = LI->getType();
+        if (!ValTy->isFirstClassType())
+          continue;
+
+        Value *Ptr = LI->getPointerOperand();
+        auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+        if (!PtrTy || PtrTy->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+          continue;
+
+        if (!LI->hasOneUse())
+          continue;
+        auto *SI = dyn_cast<StoreInst>(LI->user_back());
+        if (!SI || !SI->isSimple())
+          continue;
+        if (SI->getValueOperand() != LI)
+          continue;
+
+        Value *SPtr = SI->getPointerOperand();
+        if (SPtr->stripPointerCasts() != Ptr->stripPointerCasts())
+          continue;
+
+        TypeSize TS = DL->getTypeStoreSize(ValTy);
+        if (TS.isScalable())
+          continue;
+        uint64_t Size = TS.getFixedValue();
+        if (Size == 0 || Size > LDSBufferingMaxBytes)
+          continue;
+        Align LoadAlign = LI->getAlign();
+        if (LoadAlign < Align(16))
+          continue;
+        // Per-thread slots are strided by the element store size, so require
+        // the stride to preserve the claimed alignment for every slot.
+        if (!isAligned(LoadAlign, Size))
+          continue;
+
+        // Create an LDS slot near the load and emit memcpy global->LDS.
+        LLVM_DEBUG({
+          dbgs() << "[LDSBuffer] Candidate found: load->store same ptr in "
+                 << F.getName() << '\n';
+          dbgs() << "  size=" << Size << "B, align=" << LoadAlign.value()
+                 << ", ptr AS=" << PtrTy->getAddressSpace() << '\n';
+        });
+        IRBuilder<> BLoad(LI);
+        Align Alignment = LoadAlign;
+
+        // Ensure the LDS budget allows allocating a per-thread slot.
+        uint32_t NewSize = alignTo(LocalUsage, Alignment);
+        NewSize += WorkGroupSize * static_cast<uint32_t>(Size);
+        if (NewSize > LocalLimit)
+          continue;
+        LocalUsage = NewSize;
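+        // For example, a 64-byte candidate (the default cap) in a kernel
+        // with a 256-lane maximum workgroup reserves 64 * 256 = 16384 bytes
+        // (16 KiB) of the remaining LDS budget.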
+        auto [GV, SlotPtr] =
+            createLDSGlobalAndThreadSlot(F, ValTy, Alignment, "ldsbuf", BLoad);
+        // memcpy p3 <- p1.
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy global->LDS: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BLoad.CreateMemCpy(SlotPtr, Alignment, Ptr, Alignment, Size);
+
+        // Replace the final store with memcpy LDS->global.
+        IRBuilder<> BStore(SI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy LDS->global: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BStore.CreateMemCpy(SPtr, Alignment, SlotPtr, Alignment, Size);
+
+        ToErase.push_back(SI);
+        ToErase.push_back(LI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Erase original load/store pair\n");
+        Changed = true;
+        ++NumTransformed;
+      }
+    }
+
+    for (Instruction *E : ToErase)
+      E->eraseFromParent();
+
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Transformations applied: "
+                      << NumTransformed << '\n');
+
+    return Changed;
+  }
+
+private:
+  // Read the workgroup size Y and Z components from the HSA dispatch packet.
+  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder) {
+    Function &F = *Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+
+    CallInst *DispatchPtr =
+        Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
+    DispatchPtr->addRetAttr(Attribute::NoAlias);
+    DispatchPtr->addRetAttr(Attribute::NonNull);
+    DispatchPtr->addDereferenceableRetAttr(64);
+    F.removeFnAttr("amdgpu-no-dispatch-ptr");
+
+    Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+    Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
+    LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
+    Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
+    LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
+    MDNode *MD = MDNode::get(Mod->getContext(), {});
+    LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+    LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+    ST.makeLIDRangeMetadata(LoadZU);
+    Value *Y = Builder.CreateLShr(LoadXY, 16);
+    return std::pair(Y, LoadZU);
+  }
+
+  // Get the workitem id for dimension N (0, 1, or 2).
+  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+    Function *F = Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
+    Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+    StringRef AttrName;
+    switch (N) {
+    case 0:
+      IntrID = Intrinsic::amdgcn_workitem_id_x;
+      AttrName = "amdgpu-no-workitem-id-x";
+      break;
+    case 1:
+      IntrID = Intrinsic::amdgcn_workitem_id_y;
+      AttrName = "amdgpu-no-workitem-id-y";
+      break;
+    case 2:
+      IntrID = Intrinsic::amdgcn_workitem_id_z;
+      AttrName = "amdgpu-no-workitem-id-z";
+      break;
+    default:
+      llvm_unreachable("invalid dimension");
+    }
+    Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
+    CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+    ST.makeLIDRangeMetadata(CI);
+    F->removeFnAttr(AttrName);
+    return CI;
+  }
+
+  // Compute the linear thread id within a workgroup.
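+  // The id is linearized as TID = X * (CntY * CntZ) + Y * CntZ + Z, which
+  // matches the per-thread slot layout of the LDS arrays created below.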
+  Value *buildLinearThreadId(IRBuilder<> &Builder) {
+    Value *TCntY, *TCntZ;
+    std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+    Value *TIdX = getWorkitemID(Builder, 0);
+    Value *TIdY = getWorkitemID(Builder, 1);
+    Value *TIdZ = getWorkitemID(Builder, 2);
+    Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
+    Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+    Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
+    Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+    TID = Builder.CreateAdd(TID, TIdZ);
+    return TID;
+  }
+
+  // Create an LDS array [WGSize x ElemTy] and return a pointer to the
+  // current thread's slot.
+  std::pair<GlobalVariable *, Value *>
+  createLDSGlobalAndThreadSlot(Function &F, Type *ElemTy, Align Alignment,
+                               StringRef BaseName, IRBuilder<> &Builder) {
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+    Type *ArrTy = ArrayType::get(ElemTy, WorkGroupSize);
+    GlobalVariable *GV = new GlobalVariable(
+        *Mod, ArrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
+        PoisonValue::get(ArrTy), (F.getName() + "." + BaseName).str(),
+        nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    GV->setAlignment(Alignment);
+
+    LLVM_DEBUG({
+      dbgs() << "[LDSBuffer] Create LDS global: name=" << GV->getName()
+             << ", elemTy=" << *ElemTy << ", WGSize=" << WorkGroupSize
+             << ", align=" << Alignment.value() << '\n';
+    });
+
+    Value *LinearTID = buildLinearThreadId(Builder);
+    LLVMContext &Ctx = Mod->getContext();
+    Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)),
+                        LinearTID};
+    Value *SlotPtr = Builder.CreateInBoundsGEP(ArrTy, GV, Indices);
+    return {GV, SlotPtr};
+  }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses
+AMDGPULDSBufferingPass::run(Function &F, FunctionAnalysisManager &AM) {
+  bool Changed = AMDGPULDSBufferingImpl(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+//===----------------------------------------------------------------------===//
+// Legacy PM wrapper
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPULDSBufferingLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPULDSBufferingLegacy() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "AMDGPU LDS Buffering"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+      return AMDGPULDSBufferingImpl(TPC->getTM<TargetMachine>()).run(F);
+    return false;
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPULDSBufferingLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                      "AMDGPU per-thread LDS buffering", false, false)
+INITIALIZE_PASS_END(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                    "AMDGPU per-thread LDS buffering", false, false)
+
+FunctionPass *llvm::createAMDGPULDSBufferingLegacyPass() {
+  return new AMDGPULDSBufferingLegacy();
+}

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-lds-buffering", AMDGPULDSBufferingPass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
