diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 67042b700c047..400fa686edc4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -270,6 +270,21 @@ struct AMDGPUPromoteAllocaToVectorPass
   TargetMachine &TM;
 };
 
+// Buffer selected per-thread global memory through LDS to improve
+// performance in memory-bound kernels. This runs late and is separate
+// from alloca promotion.
+struct AMDGPULDSBufferingPass : PassInfoMixin<AMDGPULDSBufferingPass> {
+  AMDGPULDSBufferingPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
+// Legacy PM wrapper for LDS buffering.
+FunctionPass *createAMDGPULDSBufferingLegacyPass();
+void initializeAMDGPULDSBufferingLegacyPass(PassRegistry &);
+
 struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
   AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
       : TM(TM), ScanImpl(ScanImpl) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp b/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp
new file mode 100644
index 0000000000000..91e7823a7a277
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULDSBuffering.cpp
@@ -0,0 +1,341 @@
+//===-- AMDGPULDSBuffering.cpp - Per-thread LDS buffering -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass buffers per-thread global memory accesses through LDS
+// (addrspace(3)) to improve performance in memory-bound kernels. The main
+// purpose is to alleviate global memory contention and cache thrashing when
+// the same global pointer is used for both load and store operations.
+//
+// The pass runs late in the pipeline, after SROA and AMDGPUPromoteAlloca,
+// using only leftover LDS budget to avoid interfering with other LDS
+// optimizations. It respects the same LDS budget constraints as
+// AMDGPUPromoteAlloca, ensuring that LDS usage remains within occupancy
+// tier limits.
+//
+// The current implementation handles the simplest pattern: a load from
+// global memory whose only use is a store back to the same pointer. This
+// pattern is transformed into a pair of memcpy operations (global->LDS and
+// LDS->global), effectively moving the value through LDS instead of
+// accessing global memory directly.
+//
+// This pass was motivated by the observation that some rocrand performance
+// tests run faster when global memory is buffered through LDS rather than
+// loaded into and stored back from registers directly. The optimization is
+// experimental and must be enabled via the -amdgpu-enable-lds-buffering
+// flag.
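+//
+// For illustration (a minimal sketch; %slot stands for the per-thread
+// element of the internal LDS array, which is sized by the maximum flat
+// workgroup size):
+//
+//   %ld = load <4 x i32>, ptr addrspace(1) %p, align 16
+//   store <4 x i32> %ld, ptr addrspace(1) %p, align 16
+//
+// becomes:
+//
+//   call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) %slot,
+//                                    ptr addrspace(1) %p, i64 16, i1 false)
+//   call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %p,
+//                                    ptr addrspace(3) %slot, i64 16, i1 false)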
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-lds-buffering"
+
+using namespace llvm;
+
+namespace {
+
+static cl::opt<unsigned>
+    LDSBufferingMaxBytes("amdgpu-lds-buffering-max-bytes",
+                         cl::desc("Max byte size for LDS buffering candidates"),
+                         cl::init(64));
+
+class AMDGPULDSBufferingImpl {
+  const TargetMachine &TM;
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+  bool IsAMDGCN = false;
+  bool IsAMDHSA = false;
+
+public:
+  AMDGPULDSBufferingImpl(const TargetMachine &TM) : TM(TM) {}
+
+  bool run(Function &F) {
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Visit function: " << F.getName()
+                      << '\n');
+    const Triple &TT = TM.getTargetTriple();
+    if (!TT.isAMDGCN())
+      return false;
+    IsAMDGCN = true;
+    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
+    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      return false;
+
+    Mod = F.getParent();
+    DL = &Mod->getDataLayout();
+
+    auto Budget = computeLDSBudget(F, TM);
+    if (!Budget.promotable)
+      return false;
+    uint32_t LocalUsage = Budget.currentUsage;
+    uint32_t LocalLimit = Budget.limit;
+
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+
+    bool Changed = false;
+    unsigned NumTransformed = 0;
+
+    // Minimal pattern: a load from AS(1) whose only use is a store back to
+    // the exact same pointer later. Replace it with a global<->LDS memcpy
+    // pair to shorten the live range and free VGPRs.
+    SmallVector<Instruction *, 8> ToErase;
+    for (BasicBlock &BB : F) {
+      for (Instruction &I : llvm::make_early_inc_range(BB)) {
+        auto *LI = dyn_cast<LoadInst>(&I);
+        if (!LI || LI->isVolatile())
+          continue;
+
+        Type *ValTy = LI->getType();
+        if (!ValTy->isFirstClassType())
+          continue;
+
+        Value *Ptr = LI->getPointerOperand();
+        auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+        if (!PtrTy || PtrTy->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+          continue;
+
+        if (!LI->hasOneUse())
+          continue;
+        auto *SI = dyn_cast<StoreInst>(LI->user_back());
+        if (!SI || SI->isVolatile())
+          continue;
+        if (SI->getValueOperand() != LI)
+          continue;
+
+        Value *SPtr = SI->getPointerOperand();
+        if (SPtr->stripPointerCasts() != Ptr->stripPointerCasts())
+          continue;
+
+        TypeSize TS = DL->getTypeStoreSize(ValTy);
+        if (TS.isScalable())
+          continue;
+        uint64_t Size = TS.getFixedValue();
+        if (Size == 0 || Size > LDSBufferingMaxBytes)
+          continue;
+        Align LoadAlign = LI->getAlign();
+        Align MinAlign = Align(16);
+        if (LoadAlign < MinAlign)
+          continue;
+
+        // Create an LDS slot near the load and emit memcpy global->LDS.
+        LLVM_DEBUG({
+          dbgs() << "[LDSBuffer] Candidate found: load->store same ptr in "
+                 << F.getName() << '\n';
+          dbgs() << "  size=" << Size << "B, align=" << LoadAlign.value()
+                 << ", ptr AS=" << PtrTy->getAddressSpace() << "\n";
+        });
+        IRBuilder<> BLoad(LI);
+        Align Alignment = LoadAlign;
+
+        // Ensure the LDS budget allows allocating a per-thread slot.
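+        // One slot per thread: the array costs WorkGroupSize * Size bytes
+        // on top of the usage accumulated so far, after padding the current
+        // usage up to the slot alignment.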
+        uint32_t NewSize = alignTo(LocalUsage, Alignment);
+        NewSize += WorkGroupSize * static_cast<uint32_t>(Size);
+        if (NewSize > LocalLimit)
+          continue;
+        LocalUsage = NewSize;
+        auto [GV, SlotPtr] =
+            createLDSGlobalAndThreadSlot(F, ValTy, Alignment, "ldsbuf", BLoad);
+        // memcpy p3 <- p1
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy global->LDS: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BLoad.CreateMemCpy(SlotPtr, Alignment, Ptr, Alignment, TS);
+
+        // Replace the final store with memcpy LDS->global.
+        IRBuilder<> BStore(SI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Insert memcpy LDS->global: "
+                          << GV->getName() << ", bytes=" << Size
+                          << ", align=" << Alignment.value() << '\n');
+        BStore.CreateMemCpy(SPtr, Alignment, SlotPtr, Alignment, TS);
+
+        ToErase.push_back(SI);
+        ToErase.push_back(LI);
+        LLVM_DEBUG(dbgs() << "[LDSBuffer] Erase original load/store pair\n");
+        Changed = true;
+        ++NumTransformed;
+      }
+    }
+
+    for (Instruction *E : ToErase)
+      E->eraseFromParent();
+
+    LLVM_DEBUG(dbgs() << "[LDSBuffer] Transformations applied: "
+                      << NumTransformed << "\n");
+
+    return Changed;
+  }
+
+private:
+  // Get local size Y and Z from the dispatch packet on HSA.
+  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder) {
+    Function &F = *Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+
+    CallInst *DispatchPtr =
+        Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
+    DispatchPtr->addRetAttr(Attribute::NoAlias);
+    DispatchPtr->addRetAttr(Attribute::NonNull);
+    F.removeFnAttr("amdgpu-no-dispatch-ptr");
+    DispatchPtr->addDereferenceableRetAttr(64);
+
+    Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+    Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
+    LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
+    Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
+    LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
+    MDNode *MD = MDNode::get(Mod->getContext(), {});
+    LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+    LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+    ST.makeLIDRangeMetadata(LoadZU);
+    Value *Y = Builder.CreateLShr(LoadXY, 16);
+    return std::pair(Y, LoadZU);
+  }
+
+  // Get the workitem id for dimension N (0, 1, 2).
+  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+    Function *F = Builder.GetInsertBlock()->getParent();
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
+    Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+    StringRef AttrName;
+    switch (N) {
+    case 0:
+      IntrID = Intrinsic::amdgcn_workitem_id_x;
+      AttrName = "amdgpu-no-workitem-id-x";
+      break;
+    case 1:
+      IntrID = Intrinsic::amdgcn_workitem_id_y;
+      AttrName = "amdgpu-no-workitem-id-y";
+      break;
+    case 2:
+      IntrID = Intrinsic::amdgcn_workitem_id_z;
+      AttrName = "amdgpu-no-workitem-id-z";
+      break;
+    default:
+      llvm_unreachable("invalid dimension");
+    }
+    Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
+    CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+    ST.makeLIDRangeMetadata(CI);
+    F->removeFnAttr(AttrName);
+    return CI;
+  }
+
+  // Compute the linear thread id within a workgroup.
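+  // TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ, so every thread in
+  // the workgroup maps to a distinct slot index in the LDS array.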
+  Value *buildLinearThreadId(IRBuilder<> &Builder) {
+    Value *TCntY, *TCntZ;
+    std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+    Value *TIdX = getWorkitemID(Builder, 0);
+    Value *TIdY = getWorkitemID(Builder, 1);
+    Value *TIdZ = getWorkitemID(Builder, 2);
+    Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
+    Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+    Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
+    Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+    TID = Builder.CreateAdd(TID, TIdZ);
+    return TID;
+  }
+
+  // Create an LDS array [WGSize x ElemTy] and return a pointer to the
+  // per-thread slot.
+  std::pair<GlobalVariable *, Value *>
+  createLDSGlobalAndThreadSlot(Function &F, Type *ElemTy, Align Alignment,
+                               StringRef BaseName, IRBuilder<> &Builder) {
+    const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+    Type *ArrTy = ArrayType::get(ElemTy, WorkGroupSize);
+    GlobalVariable *GV = new GlobalVariable(
+        *Mod, ArrTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
+        PoisonValue::get(ArrTy), (F.getName() + "." + BaseName).str(), nullptr,
+        GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    GV->setAlignment(Alignment);
+
+    LLVM_DEBUG({
+      dbgs() << "[LDSBuffer] Create LDS global: name=" << GV->getName()
+             << ", elemTy=" << *ElemTy << ", WGSize=" << WorkGroupSize
+             << ", align=" << Alignment.value() << '\n';
+    });
+
+    Value *LinearTID = buildLinearThreadId(Builder);
+    LLVMContext &Ctx = Mod->getContext();
+    Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)),
+                        LinearTID};
+    Value *SlotPtr = Builder.CreateInBoundsGEP(ArrTy, GV, Indices);
+    return {GV, SlotPtr};
+  }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses AMDGPULDSBufferingPass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  bool Changed = AMDGPULDSBufferingImpl(TM).run(F);
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+//===----------------------------------------------------------------------===//
+// Legacy PM wrapper
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPULDSBufferingLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPULDSBufferingLegacy() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "AMDGPU LDS Buffering"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+      return AMDGPULDSBufferingImpl(TPC->getTM<TargetMachine>()).run(F);
+    return false;
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPULDSBufferingLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                      "AMDGPU per-thread LDS buffering", false, false)
+INITIALIZE_PASS_END(AMDGPULDSBufferingLegacy, DEBUG_TYPE,
+                    "AMDGPU per-thread LDS buffering", false, false)
+
+FunctionPass *llvm::createAMDGPULDSBufferingLegacyPass() {
+  return new AMDGPULDSBufferingLegacy();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..45eb503bb981e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -60,6 +60,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca",
               AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-lds-buffering", AMDGPULDSBufferingPass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ddabd25894414..c5073d57618d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,6 +28,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
@@ -1350,129 +1351,23 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
 }
 
 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
-
-  FunctionType *FTy = F.getFunctionType();
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
+  AMDGPULDSBudget Budget = computeLDSBudget(F, TM);
+  CurrentLocalMemUsage = Budget.currentUsage;
+  LocalMemLimit = Budget.limit;
+  if (!Budget.promotable) {
+    if (Budget.disabledDueToLocalArg) {
       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                            "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getAddressableLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  SmallVector<const Constant *, 16> Stack;
-  SmallPtrSet<const Constant *, 8> VisitedConstants;
-  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
-
-  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
-    for (const User *U : Val->users()) {
-      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
-        if (Use->getParent()->getParent() == &F)
-          return true;
-      } else {
-        const Constant *C = cast<Constant>(U);
-        if (VisitedConstants.insert(C).second)
-          Stack.push_back(C);
-      }
-    }
-
-    return false;
-  };
-
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
-      continue;
-
-    if (visitUsers(&GV, &GV)) {
-      UsedLDS.insert(&GV);
-      Stack.clear();
-      continue;
-    }
-
-    // For any ConstantExpr uses, we need to recursively search the users until
-    // we see a function.
-    while (!Stack.empty()) {
-      const Constant *C = Stack.pop_back_val();
-      if (visitUsers(&GV, C)) {
-        UsedLDS.insert(&GV);
-        Stack.clear();
-        break;
-      }
-    }
-  }
-
-  const DataLayout &DL = Mod->getDataLayout();
-  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
-  AllocatedSizes.reserve(UsedLDS.size());
-
-  for (const GlobalVariable *GV : UsedLDS) {
-    Align Alignment =
-        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
-    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
-
-    // HIP uses an extern unsized array in local address space for dynamically
-    // allocated shared memory. In that case, we have to disable the promotion.
-    if (GV->hasExternalLinkage() && AllocSize == 0) {
-      LocalMemLimit = 0;
+    } else if (Budget.disabledDueToExternDynShared) {
       LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                            "local memory. Promoting to local memory "
                            "disabled.\n");
     }
-      return false;
-    }
-
-    AllocatedSizes.emplace_back(AllocSize, Alignment);
-  }
-
-  // Sort to try to estimate the worst case alignment padding
-  //
-  // FIXME: We should really do something to fix the addresses to a more optimal
-  // value instead
-  llvm::sort(AllocatedSizes, llvm::less_second());
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-
-  // FIXME: Try to account for padding here. The real padding and address is
-  // currently determined from the inverse order of uses in the function when
-  // legalizing, which could also potentially change. We try to estimate the
-  // worst case here, but we probably should fix the addresses earlier.
-  for (auto Alloc : AllocatedSizes) {
-    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
-    CurrentLocalMemUsage += Alloc.first;
-  }
-
-  unsigned MaxOccupancy =
-      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
-          .second;
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount =
-      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program may already use more LDS than is usable at maximum occupancy.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
     return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
+  }
 
   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
-                    << " bytes of LDS\n"
-                    << "  Rounding size to " << MaxSizeWithWaveCount
-                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-                    << " available for promotion\n");
-
+                    << " bytes of LDS\n");
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b87b54ffc4f12..71e9e4f846f03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -491,6 +491,12 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
+// Gate insertion of the AMDGPU LDS Buffering pass into the default pipeline.
+static cl::opt<bool> EnableLDSBuffering(
+    "amdgpu-enable-lds-buffering",
+    cl::desc("Enable AMDGPU LDS Buffering pass in the default pipeline"),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
     EnableLoopPrefetch("amdgpu-loop-prefetch",
                        cl::desc("Enable loop data prefetch on AMDGPU"),
@@ -579,6 +585,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
   initializeAMDGPURegBankCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPULDSBufferingLegacyPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
@@ -1353,6 +1360,9 @@ void AMDGPUPassConfig::addIRPasses() {
 
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(createAMDGPUPromoteAlloca());
 
+    // Run per-thread LDS buffering after promote-alloca to use leftover LDS.
+    if (EnableLDSBuffering)
+      addPass(createAMDGPULDSBufferingLegacyPass());
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -2096,6 +2106,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
 
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     addPass(AMDGPUPromoteAllocaPass(TM));
 
+    // Run per-thread LDS buffering after promote-alloca to use leftover LDS.
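+    // The pass is also registered as "amdgpu-lds-buffering" and can be
+    // exercised directly via opt -passes=amdgpu-lds-buffering (see the lit
+    // test added in this change).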
+    if (EnableLDSBuffering)
+      addPass(AMDGPULDSBufferingPass(TM));
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(addPass);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..aa97a2106108d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -80,6 +80,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
+  AMDGPULDSBuffering.cpp
   AMDGPUPrepareAGPRAlloc.cpp
   AMDGPUSwLowerLDS.cpp
   AMDGPUMachineFunction.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
new file mode 100644
index 0000000000000..b7b51056a332b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -0,0 +1,146 @@
+//===-- AMDGPULDSUtils.cpp - AMDGPU LDS utilities ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared helpers for computing LDS usage and limits for an AMDGPU function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AMDGPULDSUtils.h"
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Alignment.h"
+
+using namespace llvm;
+
+AMDGPULDSBudget llvm::computeLDSBudget(const Function &F,
+                                       const TargetMachine &TM) {
+  AMDGPULDSBudget Result;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+  const Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory.
+  FunctionType *FTy = F.getFunctionType();
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      Result.limit = 0;
+      Result.promotable = false;
+      Result.disabledDueToLocalArg = true;
+      return Result;
+    }
+  }
+
+  uint32_t LocalMemLimit = ST.getAddressableLocalMemorySize();
+  if (LocalMemLimit == 0) {
+    Result.limit = 0;
+    Result.promotable = false;
+    return Result;
+  }
+
+  SmallVector<const Constant *, 16> Stack;
+  SmallPtrSet<const Constant *, 8> VisitedConstants;
+  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
+
+  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
+    for (const User *U : Val->users()) {
+      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
+        if (Use->getParent()->getParent() == &F)
+          return true;
+      } else {
+        const Constant *C = cast<Constant>(U);
+        if (VisitedConstants.insert(C).second)
+          Stack.push_back(C);
+      }
+    }
+    return false;
+  };
+
+  for (const GlobalVariable &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      continue;
+
+    if (visitUsers(&GV, &GV)) {
+      UsedLDS.insert(&GV);
+      Stack.clear();
+      continue;
+    }
+
+    while (!Stack.empty()) {
+      const Constant *C = Stack.pop_back_val();
+      if (visitUsers(&GV, C)) {
+        UsedLDS.insert(&GV);
+        Stack.clear();
+        break;
+      }
+    }
+  }
+
+  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
+  AllocatedSizes.reserve(UsedLDS.size());
+
+  for (const GlobalVariable *GV : UsedLDS) {
+    Align Alignment =
+        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+
+    // HIP uses an extern unsized array in local address space for dynamically
+    // allocated shared memory.
+    if (GV->hasExternalLinkage() && AllocSize == 0) {
+      Result.limit = 0;
+      Result.promotable = false;
+      Result.disabledDueToExternDynShared = true;
+      return Result;
+    }
+
+    AllocatedSizes.emplace_back(AllocSize, Alignment);
+  }
+
+  // Sort to try to estimate the worst case alignment padding.
+  llvm::sort(AllocatedSizes, llvm::less_second());
+
+  uint32_t CurrentLocalMemUsage = 0;
+  for (auto Alloc : AllocatedSizes) {
+    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
+    CurrentLocalMemUsage += Alloc.first;
+  }
+
+  unsigned MaxOccupancy =
+      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
+          .second;
+
+  unsigned MaxSizeWithWaveCount =
+      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // The program may already use more LDS than is usable at max occupancy.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount) {
+    Result.currentUsage = CurrentLocalMemUsage;
+    Result.limit = MaxSizeWithWaveCount;
+    Result.maxOccupancy = MaxOccupancy;
+    Result.promotable = false;
+    return Result;
+  }
+
+  Result.currentUsage = CurrentLocalMemUsage;
+  Result.limit = MaxSizeWithWaveCount;
+  Result.maxOccupancy = MaxOccupancy;
+  Result.promotable = true;
+  return Result;
+}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
new file mode 100644
index 0000000000000..2340a5b86c0df
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -0,0 +1,36 @@
+//===-- AMDGPULDSUtils.h - AMDGPU LDS utilities ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shared helpers for computing LDS usage and limits for an AMDGPU function.
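+//
+// A minimal usage sketch (fields as declared below):
+//
+//   AMDGPULDSBudget B = computeLDSBudget(F, TM);
+//   if (B.promotable) {
+//     // Up to B.limit - B.currentUsage bytes of LDS remain available.
+//   }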
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
+
+#include <cstdint>
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct AMDGPULDSBudget {
+  uint32_t currentUsage = 0;
+  uint32_t limit = 0;
+  unsigned maxOccupancy = 0;
+  bool promotable = false;
+  bool disabledDueToLocalArg = false;
+  bool disabledDueToExternDynShared = false;
+};
+
+AMDGPULDSBudget computeLDSBudget(const Function &F, const TargetMachine &TM);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 7b2200d8bc488..4633763d23f22 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
   AMDGPUDelayedMCExpr.cpp
   AMDGPUPALMetadata.cpp
   AMDKernelCodeTUtils.cpp
+  AMDGPULDSUtils.cpp
 
   LINK_COMPONENTS
   Analysis
diff --git a/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll b/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll
new file mode 100644
index 0000000000000..cfcc7f6fa6c4e
--- /dev/null
+++ b/llvm/test/Transforms/AMDGPU/lds-buffering-basic.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -passes=amdgpu-lds-buffering -S %s | FileCheck %s
+
+; Check LDS global creation.
+; CHECK: @ldsbuf_test.ldsbuf = internal unnamed_addr addrspace(3) global
+
+; CHECK-LABEL: @ldsbuf_test(
+; Ensure the original direct global load is gone before the first memcpy-in.
+; CHECK-NOT: load <4 x i32>, ptr addrspace(1) %p
+; LDS slot computation: GEP on the function-scoped LDS global.
+; CHECK: %[[SLOT:[^ ]+]] = getelementptr inbounds {{.*}}, ptr addrspace(3) @ldsbuf_test.ldsbuf, i32 0, i32 %
+; memcpy global -> LDS at the load site.
+; CHECK: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3){{.*}}%[[SLOT]], ptr addrspace(1){{.*}}%p, i64 16, i1 false)
+; Ensure the original direct global store is gone between the two memcpys.
+; CHECK-NOT: store <4 x i32>
+; memcpy LDS -> global at the original store site.
+; CHECK: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1){{.*}}%p, ptr addrspace(3){{.*}}%[[SLOT]], i64 16, i1 false)
+
+; Minimal kernel with a single load from AS(1) whose only use is a store back
+; to the same pointer. It should be buffered through LDS.
+
+define amdgpu_kernel void @ldsbuf_test(ptr addrspace(1) %p) #0 {
+entry:
+  %ld = load <4 x i32>, ptr addrspace(1) %p, align 16
+  store <4 x i32> %ld, ptr addrspace(1) %p, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "target-cpu"="gfx950" "uniform-work-group-size"="true" }