[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals #165692

skc7 · 2025-10-30T10:54:33Z

This PR introduces amdgpu-lower-exec-sync pass which specifically lowers named-barrier LDS globals introduced by #114550 .

Changes include:

Moving the logic of lowering named-barrier LDS globals from amdgpu-lower-module-lds pass to this new pass.
This PR adds the pass and removes the existing lowering logic for named-barrier LDS globals from amdgpu-lower-module-lds.

See #161827 for discussion on this topic.

llvmbot · 2025-11-03T09:16:10Z

@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes

This PR introduces amdgpu-lower-special-lds pass which specifically lowers named-barrier LDS globals introduced by #114550 .

Changes include:

Moving the logic of lowering named-barrier LDS globals from amdgpu-lower-module-lds pass to this new pass.
#165746 adds the "amdgpu-lower-special-lds" pass to pipeline, which runs prior to LDS lowering passes.

See #161827 for discussion on this topic.

Full diff: https://github.com/llvm/llvm-project/pull/165692.diff

6 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+9)
(added) llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp (+234)
(modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+1)
(modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
(added) llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll (+67)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cd8b2495a4250..d878cbfce07f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
   bool GlobalOpt;
 };
 
+void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &);
+extern char &AMDGPULowerSpecialLDSLegacyPassID;
+ModulePass *createAMDGPULowerSpecialLDSLegacyPass();
+
+struct AMDGPULowerSpecialLDSPass : PassInfoMixin<AMDGPULowerSpecialLDSPass> {
+  AMDGPULowerSpecialLDSPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
 extern char &AMDGPUSwLowerLDSLegacyPassID;
 ModulePass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
new file mode 100644
index 0000000000000..5534a3ba6382e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -0,0 +1,234 @@
+//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the named barriers LDS globals which needs
+// special address assignment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-special-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                           Function *KF) {
+  bool NeedsReplacement = false;
+  for (Use &U : GV->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (isKernelLDS(F) && F != KF) {
+        NeedsReplacement = true;
+        break;
+      }
+    }
+  }
+  if (!NeedsReplacement)
+    return GV;
+  // Create a new GV used only by this kernel and its function
+  GlobalVariable *NewGV = new GlobalVariable(
+      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+  NewGV->copyAttributesFrom(GV);
+  for (Use &U : make_early_inc_range(GV->uses())) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (!isKernelLDS(F) || F == KF) {
+        U.getUser()->replaceUsesOfWith(GV, NewGV);
+      }
+    }
+  }
+  return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                     uint32_t Address) {
+  LLVMContext &Ctx = M->getContext();
+  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+  GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                  MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+  llvm::sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {std::move(V)};
+}
+
+// Main utility function for special LDS variables lowering.
+static bool lowerSpecialLDSVariables(
+    Module &M, LDSUsesInfoTy &LDSUsesInfo,
+    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+  // The 1st round: give module-absolute assignments
+  int NumAbsolutes = 0;
+  std::vector<GlobalVariable *> OrderedGVs;
+  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+    GlobalVariable *GV = K.first;
+    if (!isNamedBarrier(*GV))
+      continue;
+    // give a module-absolute assignment if it is indirectly accessed by
+    // multiple kernels. This is not precise, but we don't want to duplicate
+    // a function when it is called by multiple kernels.
+    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+      OrderedGVs.push_back(GV);
+    } else {
+      // leave it to the 2nd round, which will give a kernel-relative
+      // assignment if it is only indirectly accessed by one kernel
+      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+    }
+    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+  }
+  OrderedGVs = sortByName(std::move(OrderedGVs));
+  for (GlobalVariable *GV : OrderedGVs) {
+    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+    unsigned BarId = NumAbsolutes + 1;
+    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+    NumAbsolutes += BarCnt;
+
+    // 4 bits for alignment, 5 bits for the barrier num,
+    // 3 bits for the barrier scope
+    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+    recordLDSAbsoluteAddress(&M, GV, Offset);
+  }
+  OrderedGVs.clear();
+
+  // The 2nd round: give a kernel-relative assignment for GV that
+  // either only indirectly accessed by single kernel or only directly
+  // accessed by multiple kernels.
+  std::vector<Function *> OrderedKernels;
+  for (auto &K : LDSUsesInfo.direct_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    OrderedKernels.push_back(F);
+  }
+  OrderedKernels = sortByName(std::move(OrderedKernels));
+
+  llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+  for (Function *F : OrderedKernels) {
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+      if (!isNamedBarrier(*GV))
+        continue;
+
+      LDSUsesInfo.direct_access[F].erase(GV);
+      if (GV->isAbsoluteSymbolRef()) {
+        // already assigned
+        continue;
+      }
+      OrderedGVs.push_back(GV);
+    }
+    OrderedGVs = sortByName(std::move(OrderedGVs));
+    for (GlobalVariable *GV : OrderedGVs) {
+      // GV could also be used directly by other kernels. If so, we need to
+      // create a new GV used only by this kernel and its function.
+      auto NewGV = uniquifyGVPerKernel(M, GV, F);
+      Changed |= (NewGV != GV);
+      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      unsigned BarId = Kernel2BarId[F];
+      BarId += NumAbsolutes + 1;
+      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+      Kernel2BarId[F] += BarCnt;
+      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      recordLDSAbsoluteAddress(&M, NewGV, Offset);
+    }
+    OrderedGVs.clear();
+  }
+  // Also erase those special LDS variables from indirect_access.
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    assert(isKernelLDS(K.first));
+    for (GlobalVariable *GV : K.second) {
+      if (isNamedBarrier(*GV))
+        K.second.erase(GV);
+    }
+  }
+  return Changed;
+}
+
+static bool runLowerSpecialLDS(Module &M) {
+  CallGraph CG = CallGraph(M);
+  bool Changed = false;
+  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+  // For each kernel, what variables does it access directly or through
+  // callees
+  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+  // For each variable accessed through callees, which kernels access it
+  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    for (GlobalVariable *GV : K.second) {
+      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+    }
+  }
+
+  if (LDSUsesInfo.HasSpecialGVs) {
+    // Special LDS variables need special address assignment
+    Changed |= lowerSpecialLDSVariables(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+  }
+  return Changed;
+}
+
+class AMDGPULowerSpecialLDSLegacy : public ModulePass {
+public:
+  static char ID;
+  AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {}
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerSpecialLDSLegacy::ID = 0;
+char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+                      "AMDGPU lowering of special LDS variables", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+                    "AMDGPU lowering of special LDS variables", false, false)
+
+bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) {
+  return runLowerSpecialLDS(M);
+}
+
+ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() {
+  return new AMDGPULowerSpecialLDSLegacy();
+}
+
+PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M,
+                                                 ModuleAnalysisManager &AM) {
+  return runLowerSpecialLDS(M) ? PreservedAnalyses::none()
+                               : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..a2fd53ac1b8ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint",
 MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass())
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 #undef MODULE_PASS
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac891819..916826ea169aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILoadStoreOptimizerLegacyPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPULowerSpecialLDSLegacyPass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..c401926e22a87 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
   AMDGPUPrepareAGPRAlloc.cpp
+  AMDGPULowerSpecialLDS.cpp
   AMDGPUSwLowerLDS.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
new file mode 100644
index 0000000000000..28d94f3d42622
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+
+define void @func1() {
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define void @func2() {
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier()
+    call void @func1()
+    call void @func2()
+    ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+    call void @func2()
+    ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+; CHECK: !0 = !{i32 8396816, i32 8396817}
+; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
+; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}

Pierre-vh

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?

EDIT: Ah I see, they're in the next PR that enables the pass.
Please update the PR description to reflect that

Pierre-vh · 2025-11-04T09:15:23Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

@@ -0,0 +1,234 @@
+//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//


Maybe wait for other reviewers to give feedback first, but I think we should name this "LowerBarrierGVs" instead.

I really don't want to see more "special LDS variables" proliferate. Making the pass specific to the barrier GV would help discourage others from adding another "special LDS but not really LDS" GV.

I agree we don't want to say "special", but "barrier" is also not accurate because we will soon need to lower semaphores as well.

Could "amdgpu-lower-sync-lds" be used?

Sync here relates to synchronization primitives.

Perhaps "AMDGPULowerExecSync" ("lower execution synchronization") ?

I assume if we want to do other operations on barriers in the future, like add fallback lowerings, or fixup/optimize intrinsics, we'd do it there as well (because it's the path of least resistance, but also it makes sense to deal with all of it in one place instead of spreading it). I would still like to see the GV be removed from the LDS AS so I wouldn't make this LDS specific either.

"AMDGPULowerSyncPrimitives" sounds better ... "ExecSync" induces furious head scratching trying to understand what it could refer to.

Pierre-vh · 2025-11-04T09:16:21Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the named barriers LDS globals which needs


Elaborate a bit more, add a little code snippet with before/after. e.g.: This assigns a unique barrier ID to every !amdgpu.named.barrier GV, and encodes the barrier ID in the !absolute_symbol metadata of the GV so LDS lowering pass can safely handle it.

Updated. Thanks for feedback.

Pierre-vh · 2025-11-04T09:17:47Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

+  const DataLayout &DL = M.getDataLayout();
+  // The 1st round: give module-absolute assignments
+  int NumAbsolutes = 0;
+  std::vector<GlobalVariable *> OrderedGVs;


SmallVector

Pierre-vh · 2025-11-04T09:17:58Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

+  }
+  OrderedGVs = sortByName(std::move(OrderedGVs));
+  for (GlobalVariable *GV : OrderedGVs) {
+    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;


Suggested change

unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

Pierre-vh · 2025-11-04T09:18:07Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

+  // The 2nd round: give a kernel-relative assignment for GV that
+  // either only indirectly accessed by single kernel or only directly
+  // accessed by multiple kernels.
+  std::vector<Function *> OrderedKernels;


SmallVector

Pierre-vh · 2025-11-04T09:18:17Z

llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp

+      // create a new GV used only by this kernel and its function.
+      auto NewGV = uniquifyGVPerKernel(M, GV, F);
+      Changed |= (NewGV != GV);
+      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;


Suggested change

unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

github-actions · 2025-11-05T16:20:11Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Pierre-vh

This looks good to me but I'd like other reviewers to chime in before approving

Pierre-vh · 2025-11-07T08:38:20Z

llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll

@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }


Can you autogenerate this test instead ? It's easier to maintain that way
Also perhaps worth adding another RUN like w/ the new pass manager?

Updated this test with auto-generated checks.
A new-PM RUN line has been added in a new patch, #166731.

ssahasra · 2025-11-11T12:03:48Z

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?

EDIT: Ah I see, they're in the next PR that enables the pass. Please update the PR description to reflect that

I don't really see why we need three separate commits for this. It's just existing code getting rebundled into a pass, and that pass is being enabled by default in its new intended place. All this should be done as a single change. This kind of split can hide complexity and errors. With a single change, if anything breaks, then either things were not as simple as they seem, or errors were introduced. But it will all be trackable to this one change.

skc7 · 2025-11-11T14:27:28Z

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?
EDIT: Ah I see, they're in the next PR that enables the pass. Please update the PR description to reflect that

I don't really see why we need three separate commits for this. It's just existing code getting rebundled into a pass, and that pass is being enabled by default in its new intended place. All this should be done as a single change. This kind of split can hide complexity and errors. With a single change, if anything breaks, then either things were not as simple as they seem, or errors were introduced. But it will all be trackable to this one change.

For any pass that will be newly introduced in the amdgpu backend, developers usually follow the approach of two PRs

Creating the pass, with both legacy and new pass manager implementations.
Adding the pass into the amdgpu pipeline.

Have followed the same for this change as well. Additionally, removal of changes from lower-module-lds were made into separate PR as per feedback from @shiltian

ssahasra · 2025-11-11T14:57:43Z

For any pass that will be newly introduced in the amdgpu backend, developers usually follow the approach of two PRs
1. Creating the pass, with both legacy and new pass manager implementations.

2. Adding the pass into the `amdgpu` pipeline.
Have followed the same for this change as well. Additionally, removal of changes from lower-module-lds were made into separate PR as per feedback from @shiltian

This is not a new pass. It's existing code bundled into a pass as a refactoring. I still think this should be a single change from the point of view of someone looking at the git log. All failures should be traceable to this one transaction.

EDIT: On second thought, I am not sure I agree that such a two-step convention exists or needs to exist. Other than catching build failures on obscure platforms, it's as if step 1 does not exist until step 2 has happened. I don't know what is the benefit of that.

shiltian · 2025-11-11T15:26:44Z

@ssahasra Ideally we want to keep the change as small but "atomic" as possible, but I think the argument makes sense. I can live with that.

ssahasra

Added a couple of nit-picks. Just pick one name: lower-exec-sync or lower-sync-primitives, and then update it in all places like file name, class name, option name, option description, class description etc. I see the word "primitive" used in only descriptions but nowhere else.

ssahasra · 2025-11-12T05:55:05Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

@@ -0,0 +1,241 @@
+//===-- AMDGPULowerExecSync.cpp -----------------------------------------===//


Remove the file name. It adds no new information, and can become stale if the file is renamed.

ssahasra · 2025-11-12T05:55:33Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU Lower Execution Synchronization pass performs lowering of


Suggested change

// AMDGPU Lower Execution Synchronization pass performs lowering of

// Lower

…dule-lds

skc7 · 2025-11-12T09:24:37Z

Added a couple of nit-picks. Just pick one name: lower-exec-sync or lower-sync-primitives, and then update it in all places like file name, class name, option name, option description, class description etc. I see the word "primitive" used in only descriptions but nowhere else.

Used "execution synchronization" everywhere after recent changes. Thanks.
Rebased to fix conflicts

Pierre-vh

LGTM but I'd like another person to approve as well before this lands

ssahasra · 2025-11-12T17:24:50Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+//
+//===----------------------------------------------------------------------===//
+//
+// Lower Execution Synchronization pass performs lowering of


Suggested change

// Lower Execution Synchronization pass performs lowering of

// Lower

The rest of the line is entirely unnecessary. Just "Lower LDS global variables with ..." works.

ssahasra

LGTM. Just one nitpick, but not very important.

arsenm · 2025-11-12T21:20:39Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+  bool NeedsReplacement = false;
+  for (Use &U : GV->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (isKernelLDS(F) && F != KF) {
+        NeedsReplacement = true;
+        break;
+      }
+    }
+  }
+  if (!NeedsReplacement)
+    return GV;


This can be rewritten without the temporary variable and the break. It should also iterate over users rather than uses, and could use any_of.

arsenm · 2025-11-12T21:21:16Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+  sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();


This will need tie-breakers for anonymous (unnamed) values, since the comparison is by name only.

arsenm · 2025-11-12T21:21:38Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+  // either only indirectly accessed by single kernel or only directly
+  // accessed by multiple kernels.
+  SmallVector<Function *> OrderedKernels;
+  for (auto &K : LDSUsesInfo.direct_access) {


Use a structured binding, probably without the &?

arsenm · 2025-11-12T21:22:08Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+    assert(isKernelLDS(K.first));
+    for (GlobalVariable *GV : K.second) {
+      if (isNamedBarrier(*GV))
+        K.second.erase(GV);


Probably not safe to do this erase inside the range loop over the set

arsenm · 2025-11-12T21:22:13Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+    OrderedGVs.clear();
+  }
+  // Also erase those special LDS variables from indirect_access.
+  for (auto &K : LDSUsesInfo.indirect_access) {


arsenm · 2025-11-12T21:24:35Z

llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp

+      if (!isNamedBarrier(*GV))
+        continue;
+
+      LDSUsesInfo.direct_access[F].erase(GV);


Avoid repeated map lookups

ssahasra · 2025-11-13T01:51:34Z

FWIW, @arsenm, you are commenting on the quality of code that was previously submitted with just a cursory review. These issues can be addressed separately and don't need to block this change.

llvm-ci · 2025-11-17T11:30:00Z

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-gcc-ubuntu-no-asserts running on doug-worker-6 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/202/builds/4220

Here is the relevant piece of the build log for the reference

Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'SanitizerCommon-tsan-x86_64-Linux :: Linux/soft_rss_limit_mb_test.cpp' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/buildbot/buildbot-root/gcc-no-asserts/build/./bin/clang  --driver-mode=g++ -gline-tables-only -fsanitize=thread  -m64 -funwind-tables  -I/home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test -ldl -O2 /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -o /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# executed command: /home/buildbot/buildbot-root/gcc-no-asserts/build/./bin/clang --driver-mode=g++ -gline-tables-only -fsanitize=thread -m64 -funwind-tables -I/home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test -ldl -O2 /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -o /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# RUN: at line 5
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=1      /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_1
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=1 /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_1
# note: command had no output on stdout or stderr
# RUN: at line 6
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0 not  /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0 not /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 '--implicit-check-not=returned null'
# note: command had no output on stdout or stderr
# RUN: at line 10
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not  /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 '--implicit-check-not=returned null'
# .---command stderr------------
# | /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp:72:24: error: CHECK_MAY_RETURN_0: expected string not found in input
# | // CHECK_MAY_RETURN_0: Some of the malloc calls returned non-null:
# |                        ^
# | <stdin>:1:24: note: scanning from here
# | [0] allocating 32 times
# |                        ^
# | <stdin>:9:55: note: possible intended match here
# | ==3294419==HINT: if you don't care about these errors you may set allocator_may_return_null=1
# |                                                       ^
# | 
# | Input file: <stdin>
# | Check file: /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: [0] allocating 32 times 
# | check:71           ^~~~~~~~~~~~~~~~~~~
# | not:imp1       X~~~
# | check:72'0                            X error: no match found
# |             2:  [0] 
# | check:72'0     ~~~~~
...

This picks up the following fix: * llvm/llvm-project#165692

skc7 marked this pull request as ready for review November 3, 2025 09:15

llvmbot added the backend:AMDGPU label Nov 3, 2025

skc7 requested review from Pierre-vh, b-sumner and ssahasra November 3, 2025 09:15

skc7 requested review from ampandey-1995, arsenm and cmc-rep November 3, 2025 09:16

skc7 mentioned this pull request Nov 3, 2025

[AMDGPU][ASAN] Handle special GVs lowering in amdgpu-sw-lower-lds #161827

Closed

Pierre-vh reviewed Nov 4, 2025

View reviewed changes

skc7 changed the title ~~[AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS~~ [AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals Nov 5, 2025

This was referenced Nov 6, 2025

[AMDGPU] Enable amdgpu-lower-exec-sync pass in pipeline #165746

Closed

[AMDGPU] Remove named-barrier LDS lowering logic from amdgpu-lower-module-lds #166731

Closed

skc7 requested a review from shiltian November 6, 2025 09:09

Pierre-vh reviewed Nov 7, 2025

View reviewed changes

skc7 requested a review from Pierre-vh November 11, 2025 09:28

ssahasra reviewed Nov 12, 2025

View reviewed changes

skc7 added 7 commits November 12, 2025 14:38

[AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS

d18623a

Add comments

5bdcb2b

Elaborate description and namespace changes

9b224e8

Rename pass to amdgpu-lower-exec-sync

f30facc

Fix message for legacy pass

e8eb89b

Fix format issue

5fac212

autogenerate test amdgpu-lower-exec-sync.ll

8277767

skc7 added 8 commits November 12, 2025 14:47

[AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline

b01131a

Fix tests

926080b

update names

17791e8

remove changes from prior LDS lowering passes

71d07c3

Update amdgpu-lower-exec-sync.ll test with llc RUN line

84594f2

[AMDGPU] Remove lowering named-barrier LDS logic from amdgpu-lower-mo…

862d702

…dule-lds

Update description of pass

ce406c2

Use execution synchronization everywhere

26199e2

skc7 force-pushed the users/skc7/amdgpu_lower_special_lds branch from ce7449e to 26199e2 Compare November 12, 2025 09:22

Pierre-vh approved these changes Nov 12, 2025

View reviewed changes

ssahasra reviewed Nov 12, 2025

View reviewed changes

ssahasra approved these changes Nov 12, 2025

View reviewed changes

arsenm reviewed Nov 12, 2025

View reviewed changes

Fix pass description

d9979cf

skc7 merged commit 49d5bb0 into main Nov 17, 2025
10 checks passed

skc7 deleted the users/skc7/amdgpu_lower_special_lds branch November 17, 2025 04:38

antiagainst added a commit to antiagainst/triton that referenced this pull request Nov 17, 2025

[Backend] Bump to llvm/llvm-project@49d5bb0ad0cb

ff2397f

This picks up the following fix: * llvm/llvm-project#165692

antiagainst mentioned this pull request Nov 17, 2025

[Backend] Bump to llvm/llvm-project@49d5bb0ad0cb triton-lang/triton#8744

Merged

antiagainst added a commit to antiagainst/triton that referenced this pull request Nov 17, 2025

[Backend] Bump to llvm/llvm-project@49d5bb0ad0cb

79d8e9b

This picks up the following fix: * llvm/llvm-project#165692

antiagainst added a commit to triton-lang/triton that referenced this pull request Nov 17, 2025

[Backend] Bump to llvm/llvm-project@49d5bb0ad0cb (#8744)

3a0a893

This picks up the following fix: * llvm/llvm-project#165692

antiagainst mentioned this pull request Nov 19, 2025

[Backend] Bump to llvm/llvm-project@49d5bb0ad0cb triton-lang/triton#8766

Draft

		@@ -0,0 +1,234 @@
		//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//

	unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
	unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

		@@ -0,0 +1,67 @@
		; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 \| FileCheck %s

		%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }

		@@ -0,0 +1,241 @@
		//===-- AMDGPULowerExecSync.cpp -----------------------------------------===//

	// AMDGPU Lower Execution Synchronization pass performs lowering of
	// Lower

	// Lower Execution Synchronization pass performs lowering of
	// Lower

[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals #165692

[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals #165692

Uh oh!

Conversation

skc7 commented Oct 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 3, 2025

Uh oh!

Pierre-vh left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Nov 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Pierre-vh left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ssahasra commented Nov 11, 2025

Uh oh!

skc7 commented Nov 11, 2025

Uh oh!

ssahasra commented Nov 11, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

shiltian commented Nov 11, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ssahasra left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

skc7 commented Nov 12, 2025

Uh oh!

Pierre-vh left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ssahasra left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

skc7 commented Oct 30, 2025 •

edited

Loading

Pierre-vh left a comment •

edited

Loading

github-actions bot commented Nov 5, 2025 •

edited

Loading

ssahasra commented Nov 11, 2025 •

edited

Loading

shiltian commented Nov 11, 2025 •

edited

Loading