From d18623a3ebed2bf2a968dd766e4400c9fd2250f6 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 30 Oct 2025 14:51:45 +0530 Subject: [PATCH 01/16] [AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 + .../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 231 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../AMDGPU/amdgpu-lower-special-lds.ll | 67 +++++ 6 files changed, 310 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 67042b700c047..802cf0b40e5e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin { bool GlobalOpt; }; +void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &); +extern char &AMDGPULowerSpecialLDSLegacyPassID; +ModulePass *createAMDGPULowerSpecialLDSLegacyPass(); + +struct AMDGPULowerSpecialLDSPass : PassInfoMixin { + AMDGPULowerSpecialLDSPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &); extern char &AMDGPUSwLowerLDSLegacyPassID; ModulePass * diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp new file mode 100644 index 0000000000000..56161dacc49e7 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -0,0 +1,231 @@ +//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the named barriers LDS globals which needs +// special address assignment. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMemoryUtils.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +#include + +#define DEBUG_TYPE "amdgpu-lower-special-lds" + +using namespace llvm; +using namespace AMDGPU; + +namespace { + +static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (isKernelLDS(F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernelLDS(F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; +} + +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + // Write the specified address into metadata where it can be retrieved by + // the assembler. 
Format is a half open range, [Address Address+1) + LLVMContext &Ctx = M->getContext(); + auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); +} + +template std::vector sortByName(std::vector &&V) { + llvm::sort(V, [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + +bool lowerSpecialLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + std::vector OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + std::vector OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + llvm::DenseMap Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernelLDS(K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; +} + +bool runLowerSpecialLDS(Module &M) { + CallGraph CG = CallGraph(M); + bool Changed = false; + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerSpecialLDSVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + return Changed; +} + +class AMDGPULowerSpecialLDSLegacy : public ModulePass { +public: + static char ID; + AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} + bool runOnModule(Module &M) override; +}; +} // namespace + +char AMDGPULowerSpecialLDSLegacy::ID = 0; +char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, + "AMDGPU lowering 
of special LDS variables", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, + "AMDGPU lowering of special LDS variables", false, false) + +bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) { + return runLowerSpecialLDS(M); +} + +ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() { + return new AMDGPULowerSpecialLDSLegacy(); +} + +PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerSpecialLDS(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index bf6f1a9dbf576..a2fd53ac1b8ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) +MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b87b54ffc4f12..01495a3708ce3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPULowerSpecialLDSLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); 
initializeAMDGPUArgumentUsageInfoPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index a1e0e5293c706..c401926e22a87 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp + AMDGPULowerSpecialLDS.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll new file mode 100644 index 0000000000000..28d94f3d42622 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } + +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison + +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0 +; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 +; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 +; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 + +define void @func1() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + 
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + + call void @func2() + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +; CHECK: !0 = !{i32 8396816, i32 8396817} +; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913} +; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849} From 5bdcb2b5f8d08f2ac3b59045860c9e29c6f24cb2 Mon Sep 17 00:00:00 2001 From: skc7 Date: Mon, 3 Nov 2025 12:43:02 +0530 Subject: [PATCH 02/16] Add comments --- 
llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp index 56161dacc49e7..5534a3ba6382e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -18,7 +18,6 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" @@ -33,6 +32,8 @@ using namespace AMDGPU; namespace { +// If GV is also used directly by other kernels, create a new GV +// used only by this kernel and its function. static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, Function *KF) { bool NeedsReplacement = false; @@ -64,10 +65,10 @@ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, return NewGV; } +// Write the specified address into metadata where it can be retrieved by +// the assembler. Format is a half open range, [Address Address+1) static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) { - // Write the specified address into metadata where it can be retrieved by - // the assembler. Format is a half open range, [Address Address+1) LLVMContext &Ctx = M->getContext(); auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); @@ -83,7 +84,8 @@ template std::vector sortByName(std::vector &&V) { return {std::move(V)}; } -bool lowerSpecialLDSVariables( +// Main utility function for special LDS variables lowering. 
+static bool lowerSpecialLDSVariables( Module &M, LDSUsesInfoTy &LDSUsesInfo, VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { bool Changed = false; @@ -172,7 +174,7 @@ bool lowerSpecialLDSVariables( return Changed; } -bool runLowerSpecialLDS(Module &M) { +static bool runLowerSpecialLDS(Module &M) { CallGraph CG = CallGraph(M); bool Changed = false; Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); @@ -205,6 +207,7 @@ class AMDGPULowerSpecialLDSLegacy : public ModulePass { AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} bool runOnModule(Module &M) override; }; + } // namespace char AMDGPULowerSpecialLDSLegacy::ID = 0; From 9b224e85b1da9ba5f3118dca60bffc7fc4cbf008 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 15:19:31 +0530 Subject: [PATCH 03/16] Elaborate description and namespace changes --- .../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp index 5534a3ba6382e..ae869a841f7f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -6,8 +6,12 @@ // //===----------------------------------------------------------------------===// // -// This pass lowers the named barriers LDS globals which needs -// special address assignment. +// This pass performs lowering of LDS global variables with target extension +// type "amdgpu.named.barrier" that require specialized address assignment. It +// assigns a unique barrier identifier to each named-barrier LDS variable and +// encodes this identifier within the !absolute_symbol metadata of that global. +// This encoding ensures that subsequent LDS lowering passes can process these +// barriers correctly without conflicts. 
// //===----------------------------------------------------------------------===// @@ -77,8 +81,8 @@ static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, MDNode::get(Ctx, {MinC, MaxC})); } -template std::vector sortByName(std::vector &&V) { - llvm::sort(V, [](const auto *L, const auto *R) { +template SmallVector sortByName(SmallVector &&V) { + sort(V, [](const auto *L, const auto *R) { return L->getName() < R->getName(); }); return {std::move(V)}; @@ -92,7 +96,7 @@ static bool lowerSpecialLDSVariables( const DataLayout &DL = M.getDataLayout(); // The 1st round: give module-absolute assignments int NumAbsolutes = 0; - std::vector OrderedGVs; + SmallVector OrderedGVs; for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { GlobalVariable *GV = K.first; if (!isNamedBarrier(*GV)) @@ -111,7 +115,7 @@ static bool lowerSpecialLDSVariables( } OrderedGVs = sortByName(std::move(OrderedGVs)); for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; unsigned BarId = NumAbsolutes + 1; unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; NumAbsolutes += BarCnt; @@ -126,7 +130,7 @@ static bool lowerSpecialLDSVariables( // The 2nd round: give a kernel-relative assignment for GV that // either only indirectly accessed by single kernel or only directly // accessed by multiple kernels. 
- std::vector OrderedKernels; + SmallVector OrderedKernels; for (auto &K : LDSUsesInfo.direct_access) { Function *F = K.first; assert(isKernelLDS(F)); @@ -134,7 +138,7 @@ static bool lowerSpecialLDSVariables( } OrderedKernels = sortByName(std::move(OrderedKernels)); - llvm::DenseMap Kernel2BarId; + DenseMap Kernel2BarId; for (Function *F : OrderedKernels) { for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { if (!isNamedBarrier(*GV)) @@ -153,7 +157,7 @@ static bool lowerSpecialLDSVariables( // create a new GV used only by this kernel and its function. auto NewGV = uniquifyGVPerKernel(M, GV, F); Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; unsigned BarId = Kernel2BarId[F]; BarId += NumAbsolutes + 1; unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; From f30faccec86c4e91e05db9017515d4e830ef6a10 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:06:43 +0530 Subject: [PATCH 04/16] Rename pass to amdgpu-lower-exec-sync --- llvm/lib/Target/AMDGPU/AMDGPU.h | 10 ++-- ...SpecialLDS.cpp => AMDGPULowerExecSync.cpp} | 47 ++++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +- ...ecial-lds.ll => amdgpu-lower-exec-sync.ll} | 2 +- 6 files changed, 33 insertions(+), 32 deletions(-) rename llvm/lib/Target/AMDGPU/{AMDGPULowerSpecialLDS.cpp => AMDGPULowerExecSync.cpp} (84%) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds.ll => amdgpu-lower-exec-sync.ll} (97%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 802cf0b40e5e9..5af2a2755cec3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -298,12 +298,12 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin { bool GlobalOpt; }; -void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &); -extern 
char &AMDGPULowerSpecialLDSLegacyPassID; -ModulePass *createAMDGPULowerSpecialLDSLegacyPass(); +void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &); +extern char &AMDGPULowerExecSyncLegacyPassID; +ModulePass *createAMDGPULowerExecSyncLegacyPass(); -struct AMDGPULowerSpecialLDSPass : PassInfoMixin { - AMDGPULowerSpecialLDSPass() {} +struct AMDGPULowerExecSyncPass : PassInfoMixin { + AMDGPULowerExecSyncPass() {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp similarity index 84% rename from llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp rename to llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index ae869a841f7f0..4b640362e8887 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===// +//===-- AMDGPULowerExecSync.cpp -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This pass performs lowering of LDS global variables with target extension -// type "amdgpu.named.barrier" that require specialized address assignment. It -// assigns a unique barrier identifier to each named-barrier LDS variable and -// encodes this identifier within the !absolute_symbol metadata of that global. +// AMDGPU Lower Execution Synchronization pass performs lowering of +// LDS global variables with target extension type "amdgpu.named.barrier" +// that require specialized address assignment. It assigns a unique +// barrier identifier to each named-barrier LDS variable and encodes +// this identifier within the !absolute_symbol metadata of that global. 
// This encoding ensures that subsequent LDS lowering passes can process these // barriers correctly without conflicts. // @@ -29,7 +30,7 @@ #include -#define DEBUG_TYPE "amdgpu-lower-special-lds" +#define DEBUG_TYPE "amdgpu-lower-exec-sync" using namespace llvm; using namespace AMDGPU; @@ -89,7 +90,7 @@ template SmallVector sortByName(SmallVector &&V) { } // Main utility function for special LDS variables lowering. -static bool lowerSpecialLDSVariables( +static bool lowerExecSyncGlobalVariables( Module &M, LDSUsesInfoTy &LDSUsesInfo, VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { bool Changed = false; @@ -178,7 +179,7 @@ static bool lowerSpecialLDSVariables( return Changed; } -static bool runLowerSpecialLDS(Module &M) { +static bool runLowerExecSyncGlobals(Module &M) { CallGraph CG = CallGraph(M); bool Changed = false; Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); @@ -199,40 +200,40 @@ static bool runLowerSpecialLDS(Module &M) { if (LDSUsesInfo.HasSpecialGVs) { // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( + Changed |= lowerExecSyncGlobalVariables( M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); } return Changed; } -class AMDGPULowerSpecialLDSLegacy : public ModulePass { +class AMDGPULowerExecSyncLegacy : public ModulePass { public: static char ID; - AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} + AMDGPULowerExecSyncLegacy() : ModulePass(ID) {} bool runOnModule(Module &M) override; }; } // namespace -char AMDGPULowerSpecialLDSLegacy::ID = 0; -char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID; +char AMDGPULowerExecSyncLegacy::ID = 0; +char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; -INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, "AMDGPU lowering of special LDS variables", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 
-INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, +INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, "AMDGPU lowering of special LDS variables", false, false) -bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) { - return runLowerSpecialLDS(M); +bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { + return runLowerExecSyncGlobals(M); } -ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() { - return new AMDGPULowerSpecialLDSLegacy(); +ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() { + return new AMDGPULowerExecSyncLegacy(); } -PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M, - ModuleAnalysisManager &AM) { - return runLowerSpecialLDS(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); +PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a2fd53ac1b8ef..46d70c257b75e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,7 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) -MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass()) +MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 01495a3708ce3..e360a6db3ad78 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -567,7 
+567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); - initializeAMDGPULowerSpecialLDSLegacyPass(*PR); + initializeAMDGPULowerExecSyncLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c401926e22a87..0240315eb7066 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,7 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp - AMDGPULowerSpecialLDS.cpp + AMDGPULowerExecSync.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll similarity index 97% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 28d94f3d42622..9ba9f41d30ffe 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From e8eb89b1544eaf166bce2924e0190c2206bcf8b3 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:35:12 +0530 Subject: [PATCH 05/16] Fix message for legacy pass --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 4b640362e8887..d7124de0a6fd6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,10 +219,10 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of special LDS variables", false, false) + "AMDGPU lowering of execution synchronization globals", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of special LDS variables", false, false) + "AMDGPU lowering of execution synchronization globals", false, false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); From 5fac212c00bfd21d2dcef0535d409e3f372df855 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 22:00:14 +0530 Subject: [PATCH 06/16] Fix format issue --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index d7124de0a6fd6..3343d594f47ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,10 +219,12 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", false, false) + "AMDGPU lowering of execution synchronization globals", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", false, false) + "AMDGPU lowering of execution synchronization globals", + false, false) bool 
AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); From 8277767d2bdc2c481c93c20097a1be794f037fdb Mon Sep 17 00:00:00 2001 From: skc7 Date: Sun, 9 Nov 2025 10:54:44 +0530 Subject: [PATCH 07/16] autogenerate test amdgpu-lower-exec-sync.ll --- .../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 100 ++++++++++++------ 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 9ba9f41d30ffe..782d94845a358 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @@ -6,45 +7,75 @@ @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison -; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0 -; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 -; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 -; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 - +;. 
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +;. define void @func1() { - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void +; CHECK-LABEL: define void @func1() { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void } define void @func2() { - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void +; CHECK-LABEL: define void @func2() { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void } define amdgpu_kernel void @kernel1() #0 { -; 
CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier() - call void @func1() - call void @func2() - ret void +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + ret void } define amdgpu_kernel void @kernel2() #0 { -; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call 
void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) - call void @func2() - ret void + call void @func2() + ret void } declare void @llvm.amdgcn.s.barrier() #1 @@ -61,7 +92,12 @@ declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 attributes #0 = { nounwind } attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } - -; CHECK: !0 = !{i32 8396816, i32 8396817} -; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913} -; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +;. 
From b01131a4b991ba202fcd10d7d7c5699604826af4 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 30 Oct 2025 22:42:33 +0530 Subject: [PATCH 08/16] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ------------------ llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 6 + llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 ++ ...amdgpu-lower-special-lds-and-module-lds.ll | 119 +++++++++++++++++ .../amdgpu-lower-special-lds-and-sw-lds.ll | 86 ++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +- 9 files changed, 236 insertions(+), 131 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524c43466..3c0328e93ffbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." 
+ KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS { } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c2113ca398..f7dff4ba4c5e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -273,6 +273,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. 
std::optional HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +286,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b37aa39..827326ae90a75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -293,7 +293,8 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { if (!AMDGPU::isLDSVariableToLower(*GV)) continue; - + if (isNamedBarrier(*GV)) + continue; for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { Function *F = I->getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e360a6db3ad78..85b5775ce91af 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,6 +465,11 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt + EnableLowerSpecialLDS("amdgpu-enable-lower-special-lds", + cl::desc("Enable lowering of special lds pass."), + cl::init(true), cl::Hidden); + static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -963,6 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. 
+ if (EnableLowerSpecialLDS) + PM.addPass(AMDGPULowerSpecialLDSPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1334,6 +1341,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerSpecialLDS) + addPass(createAMDGPULowerSpecialLDSLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -2081,6 +2092,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); + if (EnableLowerSpecialLDS) + addPass(AMDGPULowerSpecialLDSPass()); + if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll new file mode 100644 index 0000000000000..73cde6405ae1f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. 
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]] +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +;. +define void @func1() #0 { +; CHECK-LABEL: define void @func1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() #0 { +; CHECK-LABEL: define void @func2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + 
store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr 
addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @func2() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +; CHECK: [[META3]] = !{i32 0, i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll new file mode 100644 index 0000000000000..3127f1feac230 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll @@ -0,0 +1,86 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.barkernel = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; +define void @foo() #0 { +; CHECK-LABEL: define void @foo( +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + 
+define void @bar() #0 { +; CHECK-LABEL: define void @bar( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @fookernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @fookernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier() + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @foo() + call void @bar() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @barkernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @barkernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.barkernel, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.barkernel) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @bar() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void 
@llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind sanitize_address } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 8e7389ace9c5c..69dfbc2a2ae51 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr
-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-eliminat
ion,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,r
equire,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,
si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index ee6caab6f25cd..7e5b9a22f0352 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,6 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles +; GCN-O0-NEXT: AMDGPU lowering of special LDS variables ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -197,6 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; 
GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles +; GCN-O1-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -489,6 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles +; GCN-O1-OPTS-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -810,6 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles +; GCN-O2-NEXT: AMDGPU lowering of special LDS variables ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1135,6 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles +; GCN-O3-NEXT: AMDGPU lowering of special LDS variables ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 03a666fbe3aea..4fd728dfc9191 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 926080b283cd0843fa2a4fb7861b86fe105c6be9 Mon Sep 17 00:00:00 2001 From: skc7 Date: Mon, 3 Nov 2025 14:37:36 +0530 Subject: [PATCH 09/16] Fix tests --- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 5 +- ...amdgpu-lower-special-lds-and-module-lds.ll | 3 + .../amdgpu-lower-special-lds-and-sw-lds.ll | 57 +++++++------------ 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 827326ae90a75..3591c3c335338 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -291,9 +291,8 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { - if (!AMDGPU::isLDSVariableToLower(*GV)) - continue; - if (isNamedBarrier(*GV)) + // named-barrier globals are lowered by amdgpu-lower-special-lds pass. 
+ if (!AMDGPU::isLDSVariableToLower(*GV) || isNamedBarrier(*GV)) continue; for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll index 73cde6405ae1f..1ddbaf8b5d94d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll @@ -2,6 +2,9 @@ ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s +; Test to ensure that special LDS variables like named barriers are lowered correctly, +; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-special-lds pass. + %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll index 3127f1feac230..a185249488cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll @@ -1,34 +1,25 @@ ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s +; Test to ensure that special LDS variables like named barriers are lowered correctly in asan scenario, +; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-special-lds pass. 
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison -@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison @lds1 = internal addrspace(3) global [1 x i8] poison, align 4 ;. ; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] -; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] -; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] -; CHECK: @bar1.barkernel = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]] ; -define void @foo() #0 { -; CHECK-LABEL: define void @foo( -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void -} - define void @bar() #0 { ; CHECK-LABEL: define void @bar( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) ; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) ; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4 +; call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void 
@llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) call void @llvm.amdgcn.s.barrier.wait(i16 1) @@ -36,29 +27,18 @@ define void @bar() #0 { ret void } -define amdgpu_kernel void @fookernel() #0 { -; CHECK-LABEL: define amdgpu_kernel void @fookernel( -; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) -; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) -; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) -; CHECK: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) -; CHECK: call void @llvm.amdgcn.s.barrier() - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier() - call void @foo() - call void @bar() - store i8 9, ptr addrspace(3) @lds1, align 4 - ret void -} - define amdgpu_kernel void @barkernel() #0 { ; CHECK-LABEL: define amdgpu_kernel void @barkernel( -; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.barkernel, i32 9) -; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.barkernel) +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}}) +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) ; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: call void @bar() +; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4 +; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}}) +; call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) call void 
@llvm.amdgcn.s.barrier.wait(i16 1) @@ -84,3 +64,10 @@ attributes #2 = { nounwind readnone } !llvm.module.flags = !{!0} !0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address } +; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" } +;. +; CHECK: [[META0]] = !{i32 8396880, i32 8396881} +; CHECK: [[META1]] = !{i32 8396816, i32 8396817} +;. From 17791e8610f393c54e77d501f2c146374fa5bf90 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:28:58 +0530 Subject: [PATCH 10/16] update names --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18 +++++++++--------- ...> amdgpu-lower-exec-sync-and-module-lds.ll} | 6 +++--- ...ll => amdgpu-lower-exec-sync-and-sw-lds.ll} | 6 +++--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds-and-module-lds.ll => amdgpu-lower-exec-sync-and-module-lds.ll} (96%) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds-and-sw-lds.ll => amdgpu-lower-exec-sync-and-sw-lds.ll} (92%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 85b5775ce91af..0890b82eb45ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -466,9 +466,9 @@ static cl::opt EnableScalarIRPasses( cl::Hidden); static cl::opt - EnableLowerSpecialLDS("amdgpu-enable-lower-special-lds", - cl::desc("Enable lowering of special lds pass."), - cl::init(true), cl::Hidden); + EnableLowerExecSync("amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of exec sync pass."), + cl::init(true), cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", @@ -968,8 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N 
option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. - if (EnableLowerSpecialLDS) - PM.addPass(AMDGPULowerSpecialLDSPass()); + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1342,8 +1342,8 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); // Lower special LDS accesses. - if (EnableLowerSpecialLDS) - addPass(createAMDGPULowerSpecialLDSLegacyPass()); + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) @@ -2092,8 +2092,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); - if (EnableLowerSpecialLDS) - addPass(AMDGPULowerSpecialLDSPass()); + if (EnableLowerExecSync) + addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll similarity index 96% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll index 1ddbaf8b5d94d..bed8fa20a5044 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc < %s 
-enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s -; Test to ensure that special LDS variables like named barriers are lowered correctly, -; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-special-lds pass. +; Test to ensure that LDS variables like named barriers are lowered correctly, +; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll similarity index 92% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll index a185249488cdb..05f2f07c84503 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll @@ -1,8 +1,8 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s -; Test to ensure that special LDS variables like named barriers are lowered correctly in asan scenario, -; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-special-lds pass. +; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario, +; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. 
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 7e5b9a22f0352..a366e9508fc1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of special LDS variables +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of special LDS variables +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of special LDS variables +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function 
Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of special LDS variables +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of special LDS variables +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 4fd728dfc9191..9f3dfb01282bc 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 71d07c39a139da828c8fccd82af5ac60b83572f6 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 6 Nov 2025 12:50:33 +0530 Subject: [PATCH 11/16] remove changes from prior LDS lowerin passes --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 +- 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
index 3c0328e93ffbd..a4ef524c43466 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,6 +922,126 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } + static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (isKernelLDS(F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernelLDS(F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; + } + + bool lowerSpecialLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + std::vector OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + std::vector OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + llvm::DenseMap Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernelLDS(K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; + } + bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -944,6 +1064,12 @@ class AMDGPULowerModuleLDS { } } + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerSpecialLDSVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 3591c3c335338..4a9437b37aa39 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -291,9 +291,9 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { - // named-barrier globals are lowered by amdgpu-lower-special-lds pass. 
- if (!AMDGPU::isLDSVariableToLower(*GV) || isNamedBarrier(*GV)) + if (!AMDGPU::isLDSVariableToLower(*GV)) continue; + for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { Function *F = I->getFunction(); From 84594f2429f1c65fe4f28d6bb93f48d9309ddaef Mon Sep 17 00:00:00 2001 From: skc7 Date: Sun, 9 Nov 2025 10:59:11 +0530 Subject: [PATCH 12/16] Update amdgpu-lower-exec-sync.ll test with llc RUN line --- llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 782d94845a358..bde6db6463cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-exec-sync -mtriple=amdgcn-amd-amdhsa | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 862d70282db157c65b6a927c6128696c443ac946 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 6 Nov 2025 14:29:17 +0530 Subject: [PATCH 13/16] [AMDGPU] Remove lowering named-barrier LDS logic from amdgpu-lower-module-lds --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ------------------ 1 file changed, 126 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524c43466..3c0328e93ffbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use
&U : GV->uses()) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS { } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; From ce406c280bf3d73d27d39f9b361c6a92a0c7b02e Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 12 Nov 2025 10:34:23 +0530 Subject: [PATCH 14/16] Update description of pass --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 ++++---- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 3343d594f47ad..f939eded29102 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,11 +219,11 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; 
INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", + "AMDGPU lowering of execution synchronization primitives", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", + "AMDGPU lowering of execution synchronization primitives", false, false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0890b82eb45ed..095e34b86b08f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,10 +465,10 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); -static cl::opt - EnableLowerExecSync("amdgpu-enable-lower-exec-sync", - cl::desc("Enable lowering of exec sync pass."), - cl::init(true), cl::Hidden); +static cl::opt EnableLowerExecSync( + "amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of execution synchronization primitives."), + cl::init(true), cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index a366e9508fc1c..8f1335ab87c85 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA 
impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager From 26199e238f014c7cbfc39e7a76a0374f6cb3109f Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 12 Nov 2025 14:49:49 +0530 Subject: 
[PATCH 15/16] Use execution synchronization everywhere --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 12 ++++++------ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index f939eded29102..2938592164f0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerExecSync.cpp -----------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// AMDGPU Lower Execution Synchronization pass performs lowering of +// Lower Execution Synchronization pass performs lowering of // LDS global variables with target extension type "amdgpu.named.barrier" // that require specialized address assignment. 
It assigns a unique // barrier identifier to each named-barrier LDS variable and encodes @@ -219,12 +219,12 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization primitives", - false, false) + "AMDGPU lowering of execution synchronization", false, + false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization primitives", - false, false) + "AMDGPU lowering of execution synchronization", false, + false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 095e34b86b08f..5ff16e29bbbb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -467,8 +467,8 @@ static cl::opt EnableScalarIRPasses( static cl::opt EnableLowerExecSync( "amdgpu-enable-lower-exec-sync", - cl::desc("Enable lowering of execution synchronization primitives."), - cl::init(true), cl::Hidden); + cl::desc("Enable lowering of execution synchronization."), cl::init(true), + cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 8f1335ab87c85..fe75b2b5bfcf5 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization ; GCN-O0-NEXT: AMDGPU Software lowering of 
LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: 
FunctionPass Manager From d9979cfd1469aaf8f3af9e58dfa1488a155ef862 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 13 Nov 2025 21:01:47 +0530 Subject: [PATCH 16/16] Fix pass description --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 2938592164f0a..89f6b38df9d56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Lower Execution Synchronization pass performs lowering of -// LDS global variables with target extension type "amdgpu.named.barrier" +// Lower LDS global variables with target extension type "amdgpu.named.barrier" // that require specialized address assignment. It assigns a unique // barrier identifier to each named-barrier LDS variable and encodes // this identifier within the !absolute_symbol metadata of that global.