Skip to content

Conversation

@skc7
Copy link
Contributor

@skc7 skc7 commented Oct 30, 2025

This PR introduces amdgpu-lower-exec-sync pass which specifically lowers named-barrier LDS globals introduced by #114550 .

Changes include:

  • Moving the logic of lowering named-barrier LDS globals from amdgpu-lower-module-lds pass to this new pass.

  • This PR adds the pass, remove the existing lowering logic for named-barrier LDS in amdgpu-lower-module-lds

See #161827 for discussion on this topic.

@skc7 skc7 marked this pull request as ready for review November 3, 2025 09:15
@llvmbot
Copy link
Member

llvmbot commented Nov 3, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes

This PR introduces amdgpu-lower-special-lds pass which specifically lowers named-barrier LDS globals introduced by #114550 .

Changes include:

  • Moving the logic of lowering named-barrier LDS globals from amdgpu-lower-module-lds pass to this new pass.

  • #165746 adds the "amdgpu-lower-special-lds" pass to pipeline, which runs prior to LDS lowering passes.

See #161827 for discussion on this topic.


Full diff: https://github.com/llvm/llvm-project/pull/165692.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+9)
  • (added) llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp (+234)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def (+1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+1)
  • (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
  • (added) llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll (+67)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cd8b2495a4250..d878cbfce07f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
   bool GlobalOpt;
 };
 
+void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &);
+extern char &AMDGPULowerSpecialLDSLegacyPassID;
+ModulePass *createAMDGPULowerSpecialLDSLegacyPass();
+
+struct AMDGPULowerSpecialLDSPass : PassInfoMixin<AMDGPULowerSpecialLDSPass> {
+  AMDGPULowerSpecialLDSPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
 extern char &AMDGPUSwLowerLDSLegacyPassID;
 ModulePass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
new file mode 100644
index 0000000000000..5534a3ba6382e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -0,0 +1,234 @@
+//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the named barriers LDS globals which needs
+// special address assignment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-special-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                           Function *KF) {
+  bool NeedsReplacement = false;
+  for (Use &U : GV->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (isKernelLDS(F) && F != KF) {
+        NeedsReplacement = true;
+        break;
+      }
+    }
+  }
+  if (!NeedsReplacement)
+    return GV;
+  // Create a new GV used only by this kernel and its function
+  GlobalVariable *NewGV = new GlobalVariable(
+      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+  NewGV->copyAttributesFrom(GV);
+  for (Use &U : make_early_inc_range(GV->uses())) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (!isKernelLDS(F) || F == KF) {
+        U.getUser()->replaceUsesOfWith(GV, NewGV);
+      }
+    }
+  }
+  return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                     uint32_t Address) {
+  LLVMContext &Ctx = M->getContext();
+  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+  GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                  MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+  llvm::sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {std::move(V)};
+}
+
+// Main utility function for special LDS variables lowering.
+static bool lowerSpecialLDSVariables(
+    Module &M, LDSUsesInfoTy &LDSUsesInfo,
+    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+  // The 1st round: give module-absolute assignments
+  int NumAbsolutes = 0;
+  std::vector<GlobalVariable *> OrderedGVs;
+  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+    GlobalVariable *GV = K.first;
+    if (!isNamedBarrier(*GV))
+      continue;
+    // give a module-absolute assignment if it is indirectly accessed by
+    // multiple kernels. This is not precise, but we don't want to duplicate
+    // a function when it is called by multiple kernels.
+    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+      OrderedGVs.push_back(GV);
+    } else {
+      // leave it to the 2nd round, which will give a kernel-relative
+      // assignment if it is only indirectly accessed by one kernel
+      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+    }
+    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+  }
+  OrderedGVs = sortByName(std::move(OrderedGVs));
+  for (GlobalVariable *GV : OrderedGVs) {
+    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+    unsigned BarId = NumAbsolutes + 1;
+    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+    NumAbsolutes += BarCnt;
+
+    // 4 bits for alignment, 5 bits for the barrier num,
+    // 3 bits for the barrier scope
+    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+    recordLDSAbsoluteAddress(&M, GV, Offset);
+  }
+  OrderedGVs.clear();
+
+  // The 2nd round: give a kernel-relative assignment for GV that
+  // either only indirectly accessed by single kernel or only directly
+  // accessed by multiple kernels.
+  std::vector<Function *> OrderedKernels;
+  for (auto &K : LDSUsesInfo.direct_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    OrderedKernels.push_back(F);
+  }
+  OrderedKernels = sortByName(std::move(OrderedKernels));
+
+  llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+  for (Function *F : OrderedKernels) {
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+      if (!isNamedBarrier(*GV))
+        continue;
+
+      LDSUsesInfo.direct_access[F].erase(GV);
+      if (GV->isAbsoluteSymbolRef()) {
+        // already assigned
+        continue;
+      }
+      OrderedGVs.push_back(GV);
+    }
+    OrderedGVs = sortByName(std::move(OrderedGVs));
+    for (GlobalVariable *GV : OrderedGVs) {
+      // GV could also be used directly by other kernels. If so, we need to
+      // create a new GV used only by this kernel and its function.
+      auto NewGV = uniquifyGVPerKernel(M, GV, F);
+      Changed |= (NewGV != GV);
+      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      unsigned BarId = Kernel2BarId[F];
+      BarId += NumAbsolutes + 1;
+      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+      Kernel2BarId[F] += BarCnt;
+      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      recordLDSAbsoluteAddress(&M, NewGV, Offset);
+    }
+    OrderedGVs.clear();
+  }
+  // Also erase those special LDS variables from indirect_access.
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    assert(isKernelLDS(K.first));
+    for (GlobalVariable *GV : K.second) {
+      if (isNamedBarrier(*GV))
+        K.second.erase(GV);
+    }
+  }
+  return Changed;
+}
+
+static bool runLowerSpecialLDS(Module &M) {
+  CallGraph CG = CallGraph(M);
+  bool Changed = false;
+  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+  // For each kernel, what variables does it access directly or through
+  // callees
+  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+  // For each variable accessed through callees, which kernels access it
+  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    for (GlobalVariable *GV : K.second) {
+      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+    }
+  }
+
+  if (LDSUsesInfo.HasSpecialGVs) {
+    // Special LDS variables need special address assignment
+    Changed |= lowerSpecialLDSVariables(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+  }
+  return Changed;
+}
+
+class AMDGPULowerSpecialLDSLegacy : public ModulePass {
+public:
+  static char ID;
+  AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {}
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerSpecialLDSLegacy::ID = 0;
+char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+                      "AMDGPU lowering of special LDS variables", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+                    "AMDGPU lowering of special LDS variables", false, false)
+
+bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) {
+  return runLowerSpecialLDS(M);
+}
+
+ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() {
+  return new AMDGPULowerSpecialLDSLegacy();
+}
+
+PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M,
+                                                 ModuleAnalysisManager &AM) {
+  return runLowerSpecialLDS(M) ? PreservedAnalyses::none()
+                               : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..a2fd53ac1b8ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint",
 MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass())
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 #undef MODULE_PASS
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac891819..916826ea169aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILoadStoreOptimizerLegacyPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPULowerSpecialLDSLegacyPass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..c401926e22a87 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
   AMDGPUPrepareAGPRAlloc.cpp
+  AMDGPULowerSpecialLDS.cpp
   AMDGPUSwLowerLDS.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
new file mode 100644
index 0000000000000..28d94f3d42622
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+
+define void @func1() {
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define void @func2() {
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier()
+    call void @func1()
+    call void @func2()
+    ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+    call void @func2()
+    ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+; CHECK: !0 = !{i32 8396816, i32 8396817}
+; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
+; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}

Copy link
Contributor

@Pierre-vh Pierre-vh left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?

EDIT: Ah I see, they're in the next PR that enables the pass.
Please update the PR description to reflect that

@@ -0,0 +1,234 @@
//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe wait for other reviewers to give feedback first, but I think we should name this "LowerBarrierGVs" instead.

I really don't want to see more "special LDS variables" proliferate. Making the pass specific to the barrier GV would help to not encourage others from adding another "special LDS not not really LDS" GV.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree we don't want to say "special", but "barrier" is also not accurate because we will soon need to lower semaphores as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"amdgpu-lower-sync-lds" Could be used?

Sync here relates to synchronization primitives.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps "AMDGPULowerExecSync" ("lower execution synchronization") ?

I assume if we want to do other operations on barriers in the future, like add fallback lowerings, or fixup/optimize intrinsics, we'd do it there as well (because it's the path of least resistance, but also it makes sense to deal with all of it in one place instead of spreading it). I would still like to see the GV be removed from the LDS AS so I wouldn't make this LDS specific either.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"AMDGPULowerSyncPrimitives" sounds better ... "ExecSync" induces furious head scratching trying to understand what it could refer to.

//
//===----------------------------------------------------------------------===//
//
// This pass lowers the named barriers LDS globals which needs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elaborate a bit more, add a little code snippet with before/after. e.g.: This assigns a unique barrier ID to every !amdgpu.named.barrier GV, and encodes the barrier ID in the !absolute_symbol metadata of the GV so LDS lowering pass can safely handle it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated. Thanks for feedback.

const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
std::vector<GlobalVariable *> OrderedGVs;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SmallVector

}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

// The 2nd round: give a kernel-relative assignment for GV that
// either only indirectly accessed by single kernel or only directly
// accessed by multiple kernels.
std::vector<Function *> OrderedKernels;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SmallVector

// create a new GV used only by this kernel and its function.
auto NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

@skc7 skc7 changed the title [AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS [AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals Nov 5, 2025
@github-actions
Copy link

github-actions bot commented Nov 5, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Contributor

@Pierre-vh Pierre-vh left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks good to me but I'd like other reviewers to chime in before approving

@@ -0,0 +1,67 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s

%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you autogenerate this test instead ? It's easier to maintain that way
Also perhaps worth adding another RUN like w/ the new pass manager?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated this test with auto-generates checks.
new-pm RUN line has been added in new patch in #166731

@skc7 skc7 requested a review from Pierre-vh November 11, 2025 09:28
@ssahasra
Copy link
Collaborator

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?

EDIT: Ah I see, they're in the next PR that enables the pass. Please update the PR description to reflect that

I don't really see why we need three separate commits for this. It's just existing code getting rebundled into a pass, and that pass is being enabled by default in its new intended place. All this should be done as a single change. This kind of split can hide complexity and errors. With a single change, if anything breaks, then either things were not as simple as they seem, or errors were introduced. But it will all be trackable to this one change.

@skc7
Copy link
Contributor Author

skc7 commented Nov 11, 2025

I don't see the changes that remove the logic from LowerModuleLDS? Are they missing?
EDIT: Ah I see, they're in the next PR that enables the pass. Please update the PR description to reflect that

I don't really see why we need three separate commits for this. It's just existing code getting rebundled into a pass, and that pass is being enabled by default in its new intended place. All this should be done as a single change. This kind of split can hide complexity and errors. With a single change, if anything breaks, then either things were not as simple as they seem, or errors were introduced. But it will all be trackable to this one change.

For any pass that will be newly introduced in the amdgpu backend, developers usually follow the approach of two PRs

  1. Creating the pass, with both legacy and new pass manager implementations.
  2. Adding the pass into the amdgpu pipeline.

Have followed the same for this change as well. Additionally, removal of changes from lower-module-lds were made into separate PR as per feedback from @shiltian

@ssahasra
Copy link
Collaborator

ssahasra commented Nov 11, 2025

For any pass that will be newly introduced in the amdgpu backend, developers usually follow the approach of two PRs

1. Creating the pass, with both legacy and new pass manager implementations.

2. Adding the pass into the `amdgpu` pipeline.

Have followed the same for this change as well. Additionally, removal of changes from lower-module-lds were made into separate PR as per feedback from @shiltian

This is not a new pass. It's existing code bundled into a pass as a refactoring. I still think this should be a single change from the point of view of someone looking at the git log. All failures should be traceable to this one transaction.

EDIT: On second thought, I am not sure I agree that such a two-step convention exists or needs to exist. Other than catching build failures on obscure platforms, it's as if step 1 does not exist until step 2 has happened. I don't know what is the benefit of that.

@shiltian
Copy link
Contributor

shiltian commented Nov 11, 2025

@ssahasra Ideally we want to keep the change as small but "atomic" as possible, but I think the argument makes sense. I can live with that.

Copy link
Collaborator

@ssahasra ssahasra left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a couple of nit-picks. Just pick one name: lower-exec-sync or lower-sync-primitives, and then update it in all places like file name, class name, option name, option description, class description etc. I see the word "primitive" used in only descriptions but nowhere else.

@@ -0,0 +1,241 @@
//===-- AMDGPULowerExecSync.cpp -----------------------------------------===//
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the file name. It adds no new information, and can become stale if the file is renamed.

//
//===----------------------------------------------------------------------===//
//
// AMDGPU Lower Execution Synchronization pass performs lowering of
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// AMDGPU Lower Execution Synchronization pass performs lowering of
// Lower

@skc7 skc7 force-pushed the users/skc7/amdgpu_lower_special_lds branch from ce7449e to 26199e2 Compare November 12, 2025 09:22
@skc7
Copy link
Contributor Author

skc7 commented Nov 12, 2025

Added a couple of nit-picks. Just pick one name: lower-exec-sync or lower-sync-primitives, and then update it in all places like file name, class name, option name, option description, class description etc. I see the word "primitive" used in only descriptions but nowhere else.

Used "execution synchronization" everywhere after recent changes. Thanks.
Rebased to fix conflicts

Copy link
Contributor

@Pierre-vh Pierre-vh left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM but I'd like another person to approve as well before this lands

//
//===----------------------------------------------------------------------===//
//
// Lower Execution Synchronization pass performs lowering of
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Lower Execution Synchronization pass performs lowering of
// Lower

That rest of the line is entirely unnecessary. Just "Lower LDS global variables with ..." works.

Copy link
Collaborator

@ssahasra ssahasra left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Just one nitpick, but not very important.

Comment on lines +44 to +55
bool NeedsReplacement = false;
for (Use &U : GV->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
Function *F = I->getFunction();
if (isKernelLDS(F) && F != KF) {
NeedsReplacement = true;
break;
}
}
}
if (!NeedsReplacement)
return GV;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can rewrite this without the temporary variable and break. Should also be over users, and could be any_of


template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
sort(V, [](const auto *L, const auto *R) {
return L->getName() < R->getName();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will need tie breakers for anonymous

// either only indirectly accessed by single kernel or only directly
// accessed by multiple kernels.
SmallVector<Function *> OrderedKernels;
for (auto &K : LDSUsesInfo.direct_access) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

structure binding, probably no &?

assert(isKernelLDS(K.first));
for (GlobalVariable *GV : K.second) {
if (isNamedBarrier(*GV))
K.second.erase(GV);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not safe to do this erase inside the range loop over the set

OrderedGVs.clear();
}
// Also erase those special LDS variables from indirect_access.
for (auto &K : LDSUsesInfo.indirect_access) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No auto

if (!isNamedBarrier(*GV))
continue;

LDSUsesInfo.direct_access[F].erase(GV);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid repeated map lookups

@ssahasra
Copy link
Collaborator

FWIW, @arsenm, you are commenting on the quality of code that was previously submitted with just a cursory review. These issues can be addressed separately and don't need to block this change.

@skc7 skc7 merged commit 49d5bb0 into main Nov 17, 2025
10 checks passed
@skc7 skc7 deleted the users/skc7/amdgpu_lower_special_lds branch November 17, 2025 04:38
@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 17, 2025

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-gcc-ubuntu-no-asserts running on doug-worker-6 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/202/builds/4220

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'SanitizerCommon-tsan-x86_64-Linux :: Linux/soft_rss_limit_mb_test.cpp' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/buildbot/buildbot-root/gcc-no-asserts/build/./bin/clang  --driver-mode=g++ -gline-tables-only -fsanitize=thread  -m64 -funwind-tables  -I/home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test -ldl -O2 /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -o /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# executed command: /home/buildbot/buildbot-root/gcc-no-asserts/build/./bin/clang --driver-mode=g++ -gline-tables-only -fsanitize=thread -m64 -funwind-tables -I/home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test -ldl -O2 /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -o /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# RUN: at line 5
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=1      /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_1
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=1 /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_1
# note: command had no output on stdout or stderr
# RUN: at line 6
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0 not  /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0 not /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 '--implicit-check-not=returned null'
# note: command had no output on stdout or stderr
# RUN: at line 10
env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not  /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp 2>&1 | FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
# executed command: env TSAN_OPTIONS=soft_rss_limit_mb=220:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not /home/buildbot/buildbot-root/gcc-no-asserts/build/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/tsan-x86_64-Linux/Linux/Output/soft_rss_limit_mb_test.cpp.tmp
# note: command had no output on stdout or stderr
# executed command: FileCheck /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp -check-prefix=CHECK_MAY_RETURN_0 '--implicit-check-not=returned null'
# .---command stderr------------
# | �[1m/home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp:72:24: �[0m�[0;1;31merror: �[0m�[1mCHECK_MAY_RETURN_0: expected string not found in input
�[0m# | �[1m�[0m// CHECK_MAY_RETURN_0: Some of the malloc calls returned non-null:
# | �[0;1;32m                       ^
�[0m# | �[0;1;32m�[0m�[1m<stdin>:1:24: �[0m�[0;1;30mnote: �[0m�[1mscanning from here
�[0m# | �[1m�[0m[0] allocating 32 times
# | �[0;1;32m                       ^
�[0m# | �[0;1;32m�[0m�[1m<stdin>:9:55: �[0m�[0;1;30mnote: �[0m�[1mpossible intended match here
�[0m# | �[1m�[0m==3294419==HINT: if you don't care about these errors you may set allocator_may_return_null=1
# | �[0;1;32m                                                      ^
�[0m# | �[0;1;32m�[0m
# | Input file: <stdin>
# | Check file: /home/buildbot/buildbot-root/gcc-no-asserts/llvm-project/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# | �[1m�[0m�[0;1;30m            1: �[0m�[1m�[0;1;46m[0] �[0mallocating 32 times�[0;1;46m �[0m
# | �[0;1;32mcheck:71           ^~~~~~~~~~~~~~~~~~~
�[0m# | �[0;1;32m�[0m�[0;1;32mnot:imp1       X~~~
�[0m# | �[0;1;32m�[0m�[0;1;31mcheck:72'0                            X error: no match found
�[0m# | �[0;1;31m�[0m�[0;1;30m            2: �[0m�[1m�[0;1;46m [0] �[0m
# | �[0;1;31mcheck:72'0     ~~~~~
...

antiagainst added a commit to antiagainst/triton that referenced this pull request Nov 17, 2025
antiagainst added a commit to antiagainst/triton that referenced this pull request Nov 17, 2025
antiagainst added a commit to triton-lang/triton that referenced this pull request Nov 17, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

8 participants