|
| 1 | +//===-- AMDGPULDSBuffering.cpp - Per-thread LDS buffering -----------------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This pass buffers per-thread global memory accesses through LDS |
| 10 | +// (addrspace(3)) to improve performance in memory-bound kernels. The main |
| 11 | +// purpose is to alleviate global memory contention and cache thrashing when |
| 12 | +// the same global pointer is used for both load and store operations. |
| 13 | +// |
| 14 | +// The pass runs late in the pipeline, after SROA and AMDGPUPromoteAlloca, |
| 15 | +// using only leftover LDS budget to avoid interfering with other LDS |
| 16 | +// optimizations. It respects the same LDS budget constraints as |
| 17 | +// AMDGPUPromoteAlloca, ensuring that LDS usage remains within occupancy |
| 18 | +// tier limits. |
| 19 | +// |
| 20 | +// Current implementation handles the simplest pattern: a load from global |
| 21 | +// memory whose only use is a store back to the same pointer. This pattern |
| 22 | +// is transformed into a pair of memcpy operations (global->LDS and |
| 23 | +// LDS->global), effectively moving the value through LDS instead of |
| 24 | +// accessing global memory directly. |
| 25 | +// |
| 26 | +// This pass was inspired by finding that some rocrand performance tests |
| 27 | +// show better performance when global memory is buffered through LDS |
| 28 | +// instead of being loaded/stored to registers directly. This optimization |
| 29 | +// is experimental and must be enabled via the -amdgpu-enable-lds-buffering |
| 30 | +// flag. |
| 31 | +// |
| 32 | +//===----------------------------------------------------------------------===// |
| 33 | + |
| 34 | +#include "AMDGPU.h" |
| 35 | +#include "GCNSubtarget.h" |
| 36 | +#include "Utils/AMDGPUBaseInfo.h" |
| 37 | +#include "Utils/AMDGPULDSUtils.h" |
| 38 | +#include "llvm/ADT/SmallVector.h" |
| 39 | +#include "llvm/IR/IRBuilder.h" |
| 40 | +#include "llvm/IR/IntrinsicsAMDGPU.h" |
| 41 | +#include "llvm/IR/PassManager.h" |
| 42 | +#include "llvm/IR/PatternMatch.h" |
| 43 | +#include "llvm/IR/Instructions.h" |
| 44 | +#include "llvm/CodeGen/TargetPassConfig.h" |
| 45 | +#include "llvm/InitializePasses.h" |
| 46 | +#include "llvm/Pass.h" |
| 47 | +#include "llvm/Support/CommandLine.h" |
| 48 | +#include "llvm/Support/Alignment.h" |
| 49 | +#include "llvm/Support/Debug.h" |
| 50 | +#include "llvm/Target/TargetMachine.h" |
| 51 | + |
| 52 | +#define DEBUG_TYPE "amdgpu-lds-buffering" |
| 53 | + |
| 54 | +using namespace llvm; |
| 55 | + |
| 56 | +namespace { |
| 57 | + |
| 58 | +static cl::opt<unsigned> LDSBufferingMaxBytes( |
| 59 | + "amdgpu-lds-buffering-max-bytes", |
| 60 | + cl::desc("Max byte size for LDS buffering candidates"), cl::init(64)); |
| 61 | + |
| 62 | +class AMDGPULDSBufferingImpl { |
| 63 | + const TargetMachine &TM; |
| 64 | + Module *Mod = nullptr; |
| 65 | + const DataLayout *DL = nullptr; |
| 66 | + bool IsAMDGCN = false; |
| 67 | + bool IsAMDHSA = false; |
| 68 | + |
| 69 | +public: |
| 70 | + AMDGPULDSBufferingImpl(const TargetMachine &TM) : TM(TM) {} |
| 71 | + |
| 72 | + bool run(Function &F) { |
| 73 | + LLVM_DEBUG(dbgs() << "[LDSBuffer] Visit function: " << F.getName() |
| 74 | + << '\n'); |
| 75 | + const Triple &TT = TM.getTargetTriple(); |
| 76 | + if (!TT.isAMDGCN()) |
| 77 | + return false; |
| 78 | + IsAMDGCN = true; |
| 79 | + IsAMDHSA = TT.getOS() == Triple::AMDHSA; |
| 80 | + |
| 81 | + if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) |
| 82 | + return false; |
| 83 | + |
| 84 | + Mod = F.getParent(); |
| 85 | + DL = &Mod->getDataLayout(); |
| 86 | + |
| 87 | + auto Budget = computeLDSBudget(F, TM); |
| 88 | + if (!Budget.promotable) |
| 89 | + return false; |
| 90 | + uint32_t localUsage = Budget.currentUsage; |
| 91 | + uint32_t localLimit = Budget.limit; |
| 92 | + |
| 93 | + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
| 94 | + unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second; |
| 95 | + |
| 96 | + bool Changed = false; |
| 97 | + unsigned NumTransformed = 0; |
| 98 | + |
| 99 | + // Minimal pattern: a load from AS(1) whose only use is a store back to the |
| 100 | + // exact same pointer later. Replace with global<->LDS memcpy pair to |
| 101 | + // shorten the live range and free VGPRs. |
| 102 | + SmallVector<Instruction *> ToErase; |
| 103 | + for (BasicBlock &BB : F) { |
| 104 | + for (Instruction &I : llvm::make_early_inc_range(BB)) { |
| 105 | + auto *LI = dyn_cast<LoadInst>(&I); |
| 106 | + if (!LI || LI->isVolatile()) |
| 107 | + continue; |
| 108 | + |
| 109 | + Type *ValTy = LI->getType(); |
| 110 | + if (!ValTy->isFirstClassType()) |
| 111 | + continue; |
| 112 | + |
| 113 | + Value *Ptr = LI->getPointerOperand(); |
| 114 | + auto *PtrTy = dyn_cast<PointerType>(Ptr->getType()); |
| 115 | + if (!PtrTy || PtrTy->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) |
| 116 | + continue; |
| 117 | + |
| 118 | + if (!LI->hasOneUse()) |
| 119 | + continue; |
| 120 | + auto *SI = dyn_cast<StoreInst>(LI->user_back()); |
| 121 | + if (!SI || SI->isVolatile()) |
| 122 | + continue; |
| 123 | + if (SI->getValueOperand() != LI) |
| 124 | + continue; |
| 125 | + |
| 126 | + Value *SPtr = SI->getPointerOperand(); |
| 127 | + if (SPtr->stripPointerCasts() != Ptr->stripPointerCasts()) |
| 128 | + continue; |
| 129 | + |
| 130 | + TypeSize TS = DL->getTypeStoreSize(ValTy); |
| 131 | + if (TS.isScalable()) |
| 132 | + continue; |
| 133 | + uint64_t Size = TS.getFixedValue(); |
| 134 | + if (Size == 0 || Size > LDSBufferingMaxBytes) |
| 135 | + continue; |
| 136 | + Align LoadAlign = LI->getAlign(); |
| 137 | + Align MinAlign = Align(16); |
| 138 | + if (LoadAlign < MinAlign) |
| 139 | + continue; |
| 140 | + |
| 141 | + // Create LDS slot near the load and emit memcpy global->LDS. |
| 142 | + LLVM_DEBUG({ |
| 143 | + dbgs() << "[LDSBuffer] Candidate found: load->store same ptr in " |
| 144 | + << F.getName() << '\n'; |
| 145 | + dbgs() << " size=" << Size << "B, align=" |
| 146 | + << LoadAlign.value() << ", ptr AS=" |
| 147 | + << PtrTy->getAddressSpace() << "\n"; |
| 148 | + }); |
| 149 | + IRBuilder<> BLoad(LI); |
| 150 | + Align Alignment = LoadAlign; |
| 151 | + |
| 152 | + // Ensure LDS budget allows allocating a per-thread slot. |
| 153 | + uint32_t NewSize = alignTo(localUsage, Alignment); |
| 154 | + NewSize += WorkGroupSize * static_cast<uint32_t>(Size); |
| 155 | + if (NewSize > localLimit) |
| 156 | + continue; |
| 157 | + localUsage = NewSize; |
| 158 | + auto [GV, SlotPtr] = |
| 159 | + createLDSGlobalAndThreadSlot(F, ValTy, Alignment, "ldsbuf", BLoad); |
| 160 | + // memcpy p3 <- p1 |
| 161 | + LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy global->LDS: " |
| 162 | + << GV->getName() << ", bytes=" << Size |
| 163 | + << ", align=" << Alignment.value() << '\n'); |
| 164 | + BLoad.CreateMemCpy(SlotPtr, Alignment, Ptr, Alignment, TS); |
| 165 | + |
| 166 | + // Replace the final store with memcpy LDS->global. |
| 167 | + IRBuilder<> BStore(SI); |
| 168 | + LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy LDS->global: " |
| 169 | + << GV->getName() << ", bytes=" << Size |
| 170 | + << ", align=" << Alignment.value() << '\n'); |
| 171 | + BStore.CreateMemCpy(SPtr, Alignment, SlotPtr, Alignment, TS); |
| 172 | + |
| 173 | + ToErase.push_back(SI); |
| 174 | + ToErase.push_back(LI); |
| 175 | + LLVM_DEBUG(dbgs() << "[LDSBuffer] Erase original load/store pair\n"); |
| 176 | + Changed = true; |
| 177 | + ++NumTransformed; |
| 178 | + } |
| 179 | + } |
| 180 | + |
| 181 | + for (Instruction *E : ToErase) |
| 182 | + E->eraseFromParent(); |
| 183 | + |
| 184 | + LLVM_DEBUG(dbgs() << "[LDSBuffer] Transformations applied: " |
| 185 | + << NumTransformed << "\n"); |
| 186 | + |
| 187 | + return Changed; |
| 188 | + } |
| 189 | + |
| 190 | +private: |
| 191 | + // Get local size Y and Z from the dispatch packet on HSA. |
| 192 | + std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder) { |
| 193 | + Function &F = *Builder.GetInsertBlock()->getParent(); |
| 194 | + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
| 195 | + |
| 196 | + CallInst *DispatchPtr = |
| 197 | + Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}); |
| 198 | + DispatchPtr->addRetAttr(Attribute::NoAlias); |
| 199 | + DispatchPtr->addRetAttr(Attribute::NonNull); |
| 200 | + F.removeFnAttr("amdgpu-no-dispatch-ptr"); |
| 201 | + DispatchPtr->addDereferenceableRetAttr(64); |
| 202 | + |
| 203 | + Type *I32Ty = Type::getInt32Ty(Mod->getContext()); |
| 204 | + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1); |
| 205 | + LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); |
| 206 | + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2); |
| 207 | + LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); |
| 208 | + MDNode *MD = MDNode::get(Mod->getContext(), {}); |
| 209 | + LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); |
| 210 | + LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); |
| 211 | + ST.makeLIDRangeMetadata(LoadZU); |
| 212 | + Value *Y = Builder.CreateLShr(LoadXY, 16); |
| 213 | + return std::pair(Y, LoadZU); |
| 214 | + } |
| 215 | + |
| 216 | + // Get workitem id for dimension N (0,1,2). |
| 217 | + Value *getWorkitemID(IRBuilder<> &Builder, unsigned N) { |
| 218 | + Function *F = Builder.GetInsertBlock()->getParent(); |
| 219 | + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F); |
| 220 | + Intrinsic::ID IntrID = Intrinsic::not_intrinsic; |
| 221 | + StringRef AttrName; |
| 222 | + switch (N) { |
| 223 | + case 0: |
| 224 | + IntrID = Intrinsic::amdgcn_workitem_id_x; |
| 225 | + AttrName = "amdgpu-no-workitem-id-x"; |
| 226 | + break; |
| 227 | + case 1: |
| 228 | + IntrID = Intrinsic::amdgcn_workitem_id_y; |
| 229 | + AttrName = "amdgpu-no-workitem-id-y"; |
| 230 | + break; |
| 231 | + case 2: |
| 232 | + IntrID = Intrinsic::amdgcn_workitem_id_z; |
| 233 | + AttrName = "amdgpu-no-workitem-id-z"; |
| 234 | + break; |
| 235 | + default: |
| 236 | + llvm_unreachable("invalid dimension"); |
| 237 | + } |
| 238 | + Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID); |
| 239 | + CallInst *CI = Builder.CreateCall(WorkitemIdFn); |
| 240 | + ST.makeLIDRangeMetadata(CI); |
| 241 | + F->removeFnAttr(AttrName); |
| 242 | + return CI; |
| 243 | + } |
| 244 | + |
| 245 | + // Compute linear thread id within a workgroup. |
| 246 | + Value *buildLinearThreadId(IRBuilder<> &Builder) { |
| 247 | + Value *TCntY, *TCntZ; |
| 248 | + std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); |
| 249 | + Value *TIdX = getWorkitemID(Builder, 0); |
| 250 | + Value *TIdY = getWorkitemID(Builder, 1); |
| 251 | + Value *TIdZ = getWorkitemID(Builder, 2); |
| 252 | + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); |
| 253 | + Tmp0 = Builder.CreateMul(Tmp0, TIdX); |
| 254 | + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); |
| 255 | + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); |
| 256 | + TID = Builder.CreateAdd(TID, TIdZ); |
| 257 | + return TID; |
| 258 | + } |
| 259 | + |
| 260 | + // Create an LDS array [WGSize x ElemTy] and return pointer to per-thread slot. |
| 261 | + std::pair<GlobalVariable *, Value *> |
| 262 | + createLDSGlobalAndThreadSlot(Function &F, Type *ElemTy, Align Alignment, |
| 263 | + StringRef BaseName, IRBuilder<> &Builder) { |
| 264 | + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); |
| 265 | + unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second; |
| 266 | + Type *ArrTy = ArrayType::get(ElemTy, WorkGroupSize); |
| 267 | + GlobalVariable *GV = new GlobalVariable( |
| 268 | + *Mod, ArrTy, /*isConstant=*/false, GlobalValue::InternalLinkage, |
| 269 | + PoisonValue::get(ArrTy), (F.getName() + "." + BaseName).str(), |
| 270 | + nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); |
| 271 | + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); |
| 272 | + GV->setAlignment(Alignment); |
| 273 | + |
| 274 | + LLVM_DEBUG({ |
| 275 | + dbgs() << "[LDSBuffer] Create LDS global: name=" << GV->getName() |
| 276 | + << ", elemTy=" << *ElemTy << ", WGSize=" << WorkGroupSize |
| 277 | + << ", align=" << Alignment.value() << '\n'; |
| 278 | + }); |
| 279 | + |
| 280 | + Value *LinearTID = buildLinearThreadId(Builder); |
| 281 | + LLVMContext &Ctx = Mod->getContext(); |
| 282 | + Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)), |
| 283 | + LinearTID}; |
| 284 | + Value *SlotPtr = Builder.CreateInBoundsGEP(ArrTy, GV, Indices); |
| 285 | + return {GV, SlotPtr}; |
| 286 | + } |
| 287 | +}; |
| 288 | + |
| 289 | +} // end anonymous namespace |
| 290 | + |
| 291 | +PreservedAnalyses |
| 292 | +AMDGPULDSBufferingPass::run(Function &F, FunctionAnalysisManager &AM) { |
| 293 | + bool Changed = AMDGPULDSBufferingImpl(TM).run(F); |
| 294 | + if (!Changed) |
| 295 | + return PreservedAnalyses::all(); |
| 296 | + |
| 297 | + PreservedAnalyses PA; |
| 298 | + PA.preserveSet<CFGAnalyses>(); |
| 299 | + return PA; |
| 300 | +} |
| 301 | + |
| 302 | +//===----------------------------------------------------------------------===// |
| 303 | +// Legacy PM wrapper |
| 304 | +//===----------------------------------------------------------------------===// |
| 305 | + |
| 306 | +namespace { |
| 307 | + |
| 308 | +class AMDGPULDSBufferingLegacy : public FunctionPass { |
| 309 | +public: |
| 310 | + static char ID; |
| 311 | + AMDGPULDSBufferingLegacy() : FunctionPass(ID) {} |
| 312 | + |
| 313 | + StringRef getPassName() const override { return "AMDGPU LDS Buffering"; } |
| 314 | + |
| 315 | + void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 316 | + AU.setPreservesCFG(); |
| 317 | + FunctionPass::getAnalysisUsage(AU); |
| 318 | + } |
| 319 | + |
| 320 | + bool runOnFunction(Function &F) override { |
| 321 | + if (skipFunction(F)) |
| 322 | + return false; |
| 323 | + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) |
| 324 | + return AMDGPULDSBufferingImpl(TPC->getTM<TargetMachine>()).run(F); |
| 325 | + return false; |
| 326 | + } |
| 327 | +}; |
| 328 | + |
| 329 | +} // end anonymous namespace |
| 330 | + |
| 331 | +char AMDGPULDSBufferingLegacy::ID = 0; |
| 332 | + |
| 333 | +INITIALIZE_PASS_BEGIN(AMDGPULDSBufferingLegacy, DEBUG_TYPE, |
| 334 | + "AMDGPU per-thread LDS buffering", false, false) |
| 335 | +INITIALIZE_PASS_END(AMDGPULDSBufferingLegacy, DEBUG_TYPE, |
| 336 | + "AMDGPU per-thread LDS buffering", false, false) |
| 337 | + |
| 338 | +FunctionPass *llvm::createAMDGPULDSBufferingLegacyPass() { |
| 339 | + return new AMDGPULDSBufferingLegacy(); |
| 340 | +} |
0 commit comments