diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 67042b700c047..400fa686edc4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -270,6 +270,21 @@ struct AMDGPUPromoteAllocaToVectorPass
   TargetMachine &TM;
 };
 
+// Buffer selected per-thread global memory through LDS to improve
+// performance in memory-bound kernels. This runs late and is separate
+// from alloca promotion.
+struct AMDGPULDSBufferingPass : PassInfoMixin<AMDGPULDSBufferingPass> {
+  AMDGPULDSBufferingPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
+// Legacy PM wrapper for LDS buffering.
+FunctionPass *createAMDGPULDSBufferingLegacyPass();
+void initializeAMDGPULDSBufferingLegacyPass(PassRegistry &);
+
 struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
   AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
       : TM(TM), ScanImpl(ScanImpl) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp b/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp
new file mode 100644
index 0000000000000..91e7823a7a277
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp
@@ -0,0 +1,341 @@
+//===-- AMDGPULDSBuffering.cpp - Per-thread LDS buffering -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass buffers per-thread global memory accesses through LDS
+// (addrspace(3)) to improve performance in memory-bound kernels. The main
+// purpose is to alleviate global memory contention and cache thrashing when
+// the same global pointer is used for both load and store operations.
+//
+// The pass runs late in the pipeline, after SROA and AMDGPUPromoteAlloca,
+// using only leftover LDS budget to avoid interfering with other LDS
+// optimizations. It respects the same LDS budget constraints as
+// AMDGPUPromoteAlloca, ensuring that LDS usage remains within occupancy
+// tier limits.
+//
+// The current implementation handles the simplest pattern: a load from
+// global memory whose only use is a store back to the same pointer. This
+// pattern is transformed into a pair of memcpy operations (global->LDS and
+// LDS->global), effectively moving the value through LDS instead of
+// accessing global memory directly.
+//
+// This pass was motivated by the observation that some rocrand performance
+// tests run faster when global memory is buffered through LDS rather than
+// loaded into and stored back from registers directly. The optimization is
+// experimental and must be enabled via the -amdgpu-enable-lds-buffering
+// flag.
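+//
+// For illustration (a minimal sketch; %slot stands for the per-thread
+// element of the internal LDS array, which is sized by the maximum flat
+// workgroup size):
+//
+//   %ld = load <4 x i32>, ptr addrspace(1) %p, align 16
+//   store <4 x i32> %ld, ptr addrspace(1) %p, align 16
+//
+// becomes:
+//
+//   call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) %slot,
+//                                    ptr addrspace(1) %p, i64 16, i1 false)
+//   call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %p,
+//                                    ptr addrspace(3) %slot, i64 16, i1 false)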
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-lds-buffering"
+
+using namespace llvm;
+
+namespace {
+
+static cl::opt<unsigned>
+    LDSBufferingMaxBytes("amdgpu-lds-buffering-max-bytes",
+                         cl::desc("Max byte size for LDS buffering candidates"),
+                         cl::init(64));
+
+class AMDGPULDSBufferingImpl {
+  const TargetMachine &TM;
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+  bool IsAMDGCN = false;
+  bool IsAMDHSA = false;
+
+public:
+  AMDGPULDSBufferingImpl(const TargetMachine &TM) : TM(TM) {}
+
+  bool run(Function &F) {
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Visit function: " << F.getName()
+                      << '\n');
+    const Triple &TT = TM.getTargetTriple();
+    if (!TT.isAMDGCN())
+      return false;
+    IsAMDGCN = true;
+    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      return false;
+
+    Mod = F.getParent();
+    DL = &Mod->getDataLayout();
+
+    auto Budget = computeLDSBudget(F, TM);
+    if (!Budget.promotable)
+      return false;
+    uint32_t LocalUsage = Budget.currentUsage;
+    uint32_t LocalLimit = Budget.limit;
+
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+
+    bool Changed = false;
+    unsigned NumTransformed = 0;
+
+    // Minimal pattern: a load from AS(1) whose only use is a store back to
+    // the exact same pointer later. Replace it with a global<->LDS memcpy
+    // pair to shorten the live range and free VGPRs.
+    SmallVector<Instruction *, 8> ToErase;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : llvm::make_early_inc_range(BB)) {
+        auto *LI = dyn_cast<LoadInst>(&I);
+        if (!LI || LI->isVolatile())
+          continue;
+
+        Type *ValTy = LI->getType();
+        if (!ValTy->isFirstClassType())
+          continue;
+
+        Value *Ptr = LI->getPointerOperand();
+        auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+        if (!PtrTy || PtrTy->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+          continue;
+
+        if (!LI->hasOneUse())
+          continue;
+        auto *SI = dyn_cast<StoreInst>(LI->user_back());
+        if (!SI || SI->isVolatile())
+          continue;
+        if (SI->getValueOperand() != LI)
+          continue;
+
+        Value *SPtr = SI->getPointerOperand();
+        if (SPtr->stripPointerCasts() != Ptr->stripPointerCasts())
+          continue;
+
+        TypeSize TS = DL->getTypeStoreSize(ValTy);
+        if (TS.isScalable())
+          continue;
+        uint64_t Size = TS.getFixedValue();
+        if (Size == 0 || Size > LDSBufferingMaxBytes)
+          continue;
+        Align LoadAlign = LI->getAlign();
+        Align MinAlign = Align(16);
+        if (LoadAlign < MinAlign)
+          continue;
+
+        // Create an LDS slot near the load and emit memcpy global->LDS.
+        LLVM_DEBUG({
+          dbgs() << "[LDSBuffer] Candidate found: load->store same ptr in "
+                 << F.getName() << '\n';
+          dbgs() << "  size=" << Size << "B, align=" << LoadAlign.value()
+                 << ", ptr AS=" << PtrTy->getAddressSpace() << "\n";
+        });
+        IRBuilder<> BLoad(LI);
+        Align Alignment = LoadAlign;
+
+        // Ensure the LDS budget allows allocating a per-thread slot.
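+        // One slot per thread: the array costs WorkGroupSize * Size bytes
+        // on top of the usage accumulated so far, after padding the current
+        // usage up to the slot alignment.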
+        uint32_t NewSize = alignTo(LocalUsage, Alignment);
+        NewSize += WorkGroupSize * static_cast<uint32_t>(Size);
+        if (NewSize > LocalLimit)
+          continue;
+        LocalUsage = NewSize;
+        auto [GV, SlotPtr] =
+            createLDSGlobalAndThreadSlot(F, ValTy, Alignment, "ldsbuf", BLoad);
+        // memcpy p3 <- p1
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy global->LDS: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BLoad.CreateMemCpy(SlotPtr, Alignment, Ptr, Alignment, TS);
+
+        // Replace the final store with memcpy LDS->global.
+        IRBuilder<> BStore(SI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy LDS->global: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BStore.CreateMemCpy(SPtr, Alignment, SlotPtr, Alignment, TS);
+
+        ToErase.push_back(SI);
+        ToErase.push_back(LI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Erase original load/store pair\n");
+        Changed = true;
+        ++NumTransformed;
+      }
+    }
+
+    for (Instruction *E : ToErase)
+      E->eraseFromParent();
+
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Transformations applied: "
+                      << NumTransformed << "\n");
+
+    return Changed;
+  }
+
+private:
+  // Get local size Y and Z from the dispatch packet on HSA.
+  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder) {
+    Function &F = *Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+
+    CallInst *DispatchPtr =
+        Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
+    DispatchPtr->addRetAttr(Attribute::NoAlias);
+    DispatchPtr->addRetAttr(Attribute::NonNull);
+    F.removeFnAttr("amdgpu-no-dispatch-ptr");
+    DispatchPtr->addDereferenceableRetAttr(64);
+
+    Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+    Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
+    LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
+    Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
+    LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
+    MDNode *MD = MDNode::get(Mod->getContext(), {});
+    LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+    LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+    ST.makeLIDRangeMetadata(LoadZU);
+    Value *Y = Builder.CreateLShr(LoadXY, 16);
+    return std::pair(Y, LoadZU);
+  }
+
+  // Get the workitem id for dimension N (0, 1, 2).
+  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+    Function *F = Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
+    Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+    StringRef AttrName;
+    switch (N) {
+    case 0:
+      IntrID = Intrinsic::amdgcn_workitem_id_x;
+      AttrName = "amdgpu-no-workitem-id-x";
+      break;
+    case 1:
+      IntrID = Intrinsic::amdgcn_workitem_id_y;
+      AttrName = "amdgpu-no-workitem-id-y";
+      break;
+    case 2:
+      IntrID = Intrinsic::amdgcn_workitem_id_z;
+      AttrName = "amdgpu-no-workitem-id-z";
+      break;
+    default:
+      llvm_unreachable("invalid dimension");
+    }
+    Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
+    CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+    ST.makeLIDRangeMetadata(CI);
+    F->removeFnAttr(AttrName);
+    return CI;
+  }
+
+  // Compute the linear thread id within a workgroup.
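+  // TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ, so every thread in
+  // the workgroup maps to a distinct slot index in the LDS array.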
+  Value *buildLinearThreadId(IRBuilder<> &Builder) {
+    Value *TCntY, *TCntZ;
+    std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+    Value *TIdX = getWorkitemID(Builder, 0);
+    Value *TIdY = getWorkitemID(Builder, 1);
+    Value *TIdZ = getWorkitemID(Builder, 2);
+    Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
+    Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+    Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
+    Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+    TID = Builder.CreateAdd(TID, TIdZ);
+    return TID;
+  }
+
+  // Create an LDS array [WGSize x ElemTy] and return a pointer to the
+  // per-thread slot.
+  std::pair<GlobalVariable *, Value *>
+  createLDSGlobalAndThreadSlot(Function &F, Type *ElemTy, Align Alignment,
+                               StringRef BaseName, IRBuilder<> &Builder) {
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+    Type *ArrTy = ArrayType::get(ElemTy, WorkGroupSize);
+    GlobalVariable *GV = new GlobalVariable(
+        *Mod, ArrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
+        PoisonValue::get(ArrTy), (F.getName() + "." + BaseName).str(), nullptr,
+        GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    GV->setAlignment(Alignment);
+
+    LLVM_DEBUG({
+      dbgs() << "[LDSBuffer] Create LDS global: name=" << GV->getName()
+             << ", elemTy=" << *ElemTy << ", WGSize=" << WorkGroupSize
+             << ", align=" << Alignment.value() << '\n';
+    });
+
+    Value *LinearTID = buildLinearThreadId(Builder);
+    LLVMContext &Ctx = Mod->getContext();
+    Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)),
+                        LinearTID};
+    Value *SlotPtr = Builder.CreateInBoundsGEP(ArrTy, GV, Indices);
+    return {GV, SlotPtr};
+  }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses AMDGPULDSBufferingPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  bool Changed = AMDGPULDSBufferingImpl(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+//===----------------------------------------------------------------------===//
+// Legacy PM wrapper
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPULDSBufferingLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPULDSBufferingLegacy() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "AMDGPU LDS Buffering"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+      return AMDGPULDSBufferingImpl(TPC->getTM<TargetMachine>()).run(F);
+    return false;
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPULDSBufferingLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                      "AMDGPU per-thread LDS buffering", false, false)
+INITIALIZE_PASS_END(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                    "AMDGPU per-thread LDS buffering", false, false)
+
+FunctionPass *llvm::createAMDGPULDSBufferingLegacyPass() {
+  return new AMDGPULDSBufferingLegacy();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..45eb503bb981e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -60,6 +60,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca",
               AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-lds-buffering", AMDGPULDSBufferingPass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ddabd25894414..c5073d57618d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,6 +28,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
@@ -1350,129 +1351,23 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
 }
 
 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
-
-  FunctionType *FTy = F.getFunctionType();
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
+  AMDGPULDSBudget Budget = computeLDSBudget(F, TM);
+  CurrentLocalMemUsage = Budget.currentUsage;
+  LocalMemLimit = Budget.limit;
+  if (!Budget.promotable) {
+    if (Budget.disabledDueToLocalArg) {
       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                            "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getAddressableLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  SmallVector<const Constant *, 16> Stack;
-  SmallPtrSet<const Constant *, 8> VisitedConstants;
-  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
-
-  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
-    for (const User *U : Val->users()) {
-      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
-        if (Use->getParent()->getParent() == &F)
-          return true;
-      } else {
-        const Constant *C = cast<Constant>(U);
-        if (VisitedConstants.insert(C).second)
-          Stack.push_back(C);
-      }
-    }
-
-    return false;
-  };
-
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
-      continue;
-
-    if (visitUsers(&GV, &GV)) {
-      UsedLDS.insert(&GV);
-      Stack.clear();
-      continue;
-    }
-
-    // For any ConstantExpr uses, we need to recursively search the users until
-    // we see a function.
-    while (!Stack.empty()) {
-      const Constant *C = Stack.pop_back_val();
-      if (visitUsers(&GV, C)) {
-        UsedLDS.insert(&GV);
-        Stack.clear();
-        break;
-      }
-    }
-  }
-
-  const DataLayout &DL = Mod->getDataLayout();
-  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
-  AllocatedSizes.reserve(UsedLDS.size());
-
-  for (const GlobalVariable *GV : UsedLDS) {
-    Align Alignment =
-        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
-    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
-
-    // HIP uses an extern unsized array in local address space for dynamically
-    // allocated shared memory. In that case, we have to disable the promotion.
-    if (GV->hasExternalLinkage() && AllocSize == 0) {
-      LocalMemLimit = 0;
+    } else if (Budget.disabledDueToExternDynShared) {
       LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                            "local memory. Promoting to local memory "
                            "disabled.\n");
     }
-      return false;
-    }
-
-    AllocatedSizes.emplace_back(AllocSize, Alignment);
-  }
-
-  // Sort to try to estimate the worst case alignment padding
-  //
-  // FIXME: We should really do something to fix the addresses to a more optimal
-  // value instead
-  llvm::sort(AllocatedSizes, llvm::less_second());
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-
-  // FIXME: Try to account for padding here. The real padding and address is
-  // currently determined from the inverse order of uses in the function when
-  // legalizing, which could also potentially change. We try to estimate the
-  // worst case here, but we probably should fix the addresses earlier.
-  for (auto Alloc : AllocatedSizes) {
-    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
-    CurrentLocalMemUsage += Alloc.first;
-  }
-
-  unsigned MaxOccupancy =
-      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
-          .second;
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount =
-      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program may already use more LDS than is usable at maximum occupancy.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
     return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
+  }
 
   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
-                    << " bytes of LDS\n"
-                    << "  Rounding size to " << MaxSizeWithWaveCount
-                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-                    << " available for promotion\n");
-
+                    << " bytes of LDS\n");
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b87b54ffc4f12..71e9e4f846f03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -491,6 +491,12 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
+// Gate insertion of the AMDGPU LDS Buffering pass into the default pipeline.
+static cl::opt<bool> EnableLDSBuffering(
+    "amdgpu-enable-lds-buffering",
+    cl::desc("Enable AMDGPU LDS Buffering pass in the default pipeline"),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
     EnableLoopPrefetch("amdgpu-loop-prefetch",
                        cl::desc("Enable loop data prefetch on AMDGPU"),
@@ -579,6 +585,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
   initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPULDSBufferingLegacyPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
@@ -1353,6 +1360,9 @@ void AMDGPUPassConfig::addIRPasses() {
 
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(createAMDGPUPromoteAlloca());
 
+    // Run per-thread LDS buffering after promote-alloca to use leftover LDS.
+    if (EnableLDSBuffering)
+      addPass(createAMDGPULDSBufferingLegacyPass());
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -2096,6 +2106,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
 
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(AMDGPUPromoteAllocaPass(TM));
 
+    // Run per-thread LDS buffering after promote-alloca to use leftover LDS.
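+    // The pass is also registered as "amdgpu-lds-buffering" and can be
+    // exercised directly via opt -passes=amdgpu-lds-buffering (see the lit
+    // test added in this change).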
+    if (EnableLDSBuffering)
+      addPass(AMDGPULDSBufferingPass(TM));
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(addPass);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..aa97a2106108d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -80,6 +80,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
+  AMDGPULDSBuffering.cpp
   AMDGPUPrepareAGPRAlloc.cpp
   AMDGPUSwLowerLDS.cpp
   AMDGPUMachineFunction.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 0000000000000..b7b51056a332b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,146 @@
+//===-- AMDGPULDSUtils.cpp - AMDGPU LDS utilities ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared helpers for computing LDS usage and limits for an AMDGPU function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AMDGPULDSUtils.h"
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Alignment.h"
+
+using namespace llvm;
+
+AMDGPULDSBudget llvm::computeLDSBudget(const Function &F,
+                                       const TargetMachine &TM) {
+  AMDGPULDSBudget Result;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+  const Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory.
+  FunctionType *FTy = F.getFunctionType();
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      Result.limit = 0;
+      Result.promotable = false;
+      Result.disabledDueToLocalArg = true;
+      return Result;
+    }
+  }
+
+  uint32_t LocalMemLimit = ST.getAddressableLocalMemorySize();
+  if (LocalMemLimit == 0) {
+    Result.limit = 0;
+    Result.promotable = false;
+    return Result;
+  }
+
+  SmallVector<const Constant *, 16> Stack;
+  SmallPtrSet<const Constant *, 8> VisitedConstants;
+  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
+
+  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
+    for (const User *U : Val->users()) {
+      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
+        if (Use->getParent()->getParent() == &F)
+          return true;
+      } else {
+        const Constant *C = cast<Constant>(U);
+        if (VisitedConstants.insert(C).second)
+          Stack.push_back(C);
+      }
+    }
+    return false;
+  };
+
+  for (const GlobalVariable &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      continue;
+
+    if (visitUsers(&GV, &GV)) {
+      UsedLDS.insert(&GV);
+      Stack.clear();
+      continue;
+    }
+
+    while (!Stack.empty()) {
+      const Constant *C = Stack.pop_back_val();
+      if (visitUsers(&GV, C)) {
+        UsedLDS.insert(&GV);
+        Stack.clear();
+        break;
+      }
+    }
+  }
+
+  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
+  AllocatedSizes.reserve(UsedLDS.size());
+
+  for (const GlobalVariable *GV : UsedLDS) {
+    Align Alignment =
+        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+
+    // HIP uses an extern unsized array in local address space for dynamically
+    // allocated shared memory.
+    if (GV->hasExternalLinkage() && AllocSize == 0) {
+      Result.limit = 0;
+      Result.promotable = false;
+      Result.disabledDueToExternDynShared = true;
+      return Result;
+    }
+
+    AllocatedSizes.emplace_back(AllocSize, Alignment);
+  }
+
+  // Sort to try to estimate the worst case alignment padding.
+  llvm::sort(AllocatedSizes, llvm::less_second());
+
+  uint32_t CurrentLocalMemUsage = 0;
+  for (auto Alloc : AllocatedSizes) {
+    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
+    CurrentLocalMemUsage += Alloc.first;
+  }
+
+  unsigned MaxOccupancy =
+      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
+          .second;
+
+  unsigned MaxSizeWithWaveCount =
+      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // The program may already use more LDS than is usable at max occupancy.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount) {
+    Result.currentUsage = CurrentLocalMemUsage;
+    Result.limit = MaxSizeWithWaveCount;
+    Result.maxOccupancy = MaxOccupancy;
+    Result.promotable = false;
+    return Result;
+  }
+
+  Result.currentUsage = CurrentLocalMemUsage;
+  Result.limit = MaxSizeWithWaveCount;
+  Result.maxOccupancy = MaxOccupancy;
+  Result.promotable = true;
+  return Result;
+}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 0000000000000..2340a5b86c0df
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,36 @@
+//===-- AMDGPULDSUtils.h - AMDGPU LDS utilities ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared helpers for computing LDS usage and limits for an AMDGPU function.
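+//
+// A minimal usage sketch (fields as declared below):
+//
+//   AMDGPULDSBudget B = computeLDSBudget(F, TM);
+//   if (B.promotable) {
+//     // Up to B.limit - B.currentUsage bytes of LDS remain available.
+//   }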
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include <cstdint>
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct AMDGPULDSBudget {
+  uint32_t currentUsage = 0;
+  uint32_t limit = 0;
+  unsigned maxOccupancy = 0;
+  bool promotable = false;
+  bool disabledDueToLocalArg = false;
+  bool disabledDueToExternDynShared = false;
+};
+
+AMDGPULDSBudget computeLDSBudget(const Function &F, const TargetMachine &TM);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 7b2200d8bc488..4633763d23f22 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
   AMDGPUDelayedMCExpr.cpp
   AMDGPUPALMetadata.cpp
   AMDKernelCodeTUtils.cpp
+  AMDGPULDSUtils.cpp
 
   LINK_COMPONENTS
   Analysis
diff --git a/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll b/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll
new file mode 100644
index 0000000000000..cfcc7f6fa6c4e
--- /dev/null
+++ b/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -passes=amdgpu-lds-buffering -S %s | FileCheck %s
+
+; Check LDS global creation.
+; CHECK: @ldsbuf_test.ldsbuf = internal unnamed_addr addrspace(3) global
+
+; CHECK-LABEL: @ldsbuf_test(
+; Ensure the original direct global load is gone before the first memcpy-in.
+; CHECK-NOT: load <4 x i32>, ptr addrspace(1) %p
+; LDS slot computation: GEP on the function-scoped LDS global.
+; CHECK: %[[SLOT:[^ ]+]] = getelementptr inbounds {{.*}}, ptr addrspace(3) @ldsbuf_test.ldsbuf, i32 0, i32 %
+; memcpy global -> LDS at the load site.
+; CHECK: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3){{.*}}%[[SLOT]], ptr addrspace(1){{.*}}%p, i64 16, i1 false)
+; Ensure the original direct global store is gone between the two memcpys.
+; CHECK-NOT: store <4 x i32>
+; memcpy LDS -> global at the original store site.
+; CHECK: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1){{.*}}%p, ptr addrspace(3){{.*}}%[[SLOT]], i64 16, i1 false)
+
+; Minimal kernel with a single load from AS(1) whose only use is a store back
+; to the same pointer. It should be buffered through LDS.
+
+define amdgpu_kernel void @ldsbuf_test(ptr addrspace(1) %p) #0 {
+entry:
+  %ld = load <4 x i32>, ptr addrspace(1) %p, align 16
+  store <4 x i32> %ld, ptr addrspace(1) %p, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "target-cpu"="gfx950" "uniform-work-group-size"="true" }