From d18623a3ebed2bf2a968dd766e4400c9fd2250f6 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 30 Oct 2025 14:51:45 +0530 Subject: [PATCH 01/16] [AMDGPU] Add amdgpu-lower-special-lds pass to lower named-barrier LDS --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 + .../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 231 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../AMDGPU/amdgpu-lower-special-lds.ll | 67 +++++ 6 files changed, 310 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 67042b700c047..802cf0b40e5e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin { bool GlobalOpt; }; +void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &); +extern char &AMDGPULowerSpecialLDSLegacyPassID; +ModulePass *createAMDGPULowerSpecialLDSLegacyPass(); + +struct AMDGPULowerSpecialLDSPass : PassInfoMixin { + AMDGPULowerSpecialLDSPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &); extern char &AMDGPUSwLowerLDSLegacyPassID; ModulePass * diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp new file mode 100644 index 0000000000000..56161dacc49e7 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -0,0 +1,231 @@ +//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the named barriers LDS globals which needs +// special address assignment. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMemoryUtils.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +#include + +#define DEBUG_TYPE "amdgpu-lower-special-lds" + +using namespace llvm; +using namespace AMDGPU; + +namespace { + +static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (isKernelLDS(F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernelLDS(F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; +} + +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + // Write the specified address into metadata where it can be retrieved by + // the assembler. 
Format is a half open range, [Address Address+1) + LLVMContext &Ctx = M->getContext(); + auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); +} + +template std::vector sortByName(std::vector &&V) { + llvm::sort(V, [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + +bool lowerSpecialLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + std::vector OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + std::vector OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + llvm::DenseMap Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernelLDS(K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; +} + +bool runLowerSpecialLDS(Module &M) { + CallGraph CG = CallGraph(M); + bool Changed = false; + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerSpecialLDSVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + return Changed; +} + +class AMDGPULowerSpecialLDSLegacy : public ModulePass { +public: + static char ID; + AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} + bool runOnModule(Module &M) override; +}; +} // namespace + +char AMDGPULowerSpecialLDSLegacy::ID = 0; +char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, + "AMDGPU lowering 
of special LDS variables", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, + "AMDGPU lowering of special LDS variables", false, false) + +bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) { + return runLowerSpecialLDS(M); +} + +ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() { + return new AMDGPULowerSpecialLDSLegacy(); +} + +PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerSpecialLDS(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index bf6f1a9dbf576..a2fd53ac1b8ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) +MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b87b54ffc4f12..01495a3708ce3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); + initializeAMDGPULowerSpecialLDSLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); 
initializeAMDGPUArgumentUsageInfoPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index a1e0e5293c706..c401926e22a87 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp + AMDGPULowerSpecialLDS.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll new file mode 100644 index 0000000000000..28d94f3d42622 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll @@ -0,0 +1,67 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } + +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison + +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0 +; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 +; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 +; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 + +define void @func1() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() { + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + 
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + + call void @func2() + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +; CHECK: !0 = !{i32 8396816, i32 8396817} +; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913} +; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849} From 5bdcb2b5f8d08f2ac3b59045860c9e29c6f24cb2 Mon Sep 17 00:00:00 2001 From: skc7 Date: Mon, 3 Nov 2025 12:43:02 +0530 Subject: [PATCH 02/16] Add comments --- 
llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp index 56161dacc49e7..5534a3ba6382e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -18,7 +18,6 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" @@ -33,6 +32,8 @@ using namespace AMDGPU; namespace { +// If GV is also used directly by other kernels, create a new GV +// used only by this kernel and its function. static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, Function *KF) { bool NeedsReplacement = false; @@ -64,10 +65,10 @@ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, return NewGV; } +// Write the specified address into metadata where it can be retrieved by +// the assembler. Format is a half open range, [Address Address+1) static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, uint32_t Address) { - // Write the specified address into metadata where it can be retrieved by - // the assembler. Format is a half open range, [Address Address+1) LLVMContext &Ctx = M->getContext(); auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); @@ -83,7 +84,8 @@ template std::vector sortByName(std::vector &&V) { return {std::move(V)}; } -bool lowerSpecialLDSVariables( +// Main utility function for special LDS variables lowering. 
+static bool lowerSpecialLDSVariables( Module &M, LDSUsesInfoTy &LDSUsesInfo, VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { bool Changed = false; @@ -172,7 +174,7 @@ bool lowerSpecialLDSVariables( return Changed; } -bool runLowerSpecialLDS(Module &M) { +static bool runLowerSpecialLDS(Module &M) { CallGraph CG = CallGraph(M); bool Changed = false; Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); @@ -205,6 +207,7 @@ class AMDGPULowerSpecialLDSLegacy : public ModulePass { AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} bool runOnModule(Module &M) override; }; + } // namespace char AMDGPULowerSpecialLDSLegacy::ID = 0; From 9b224e85b1da9ba5f3118dca60bffc7fc4cbf008 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 15:19:31 +0530 Subject: [PATCH 03/16] Elaborate description and namespace changes --- .../Target/AMDGPU/AMDGPULowerSpecialLDS.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp index 5534a3ba6382e..ae869a841f7f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp @@ -6,8 +6,12 @@ // //===----------------------------------------------------------------------===// // -// This pass lowers the named barriers LDS globals which needs -// special address assignment. +// This pass performs lowering of LDS global variables with target extension +// type "amdgpu.named.barrier" that require specialized address assignment. It +// assigns a unique barrier identifier to each named-barrier LDS variable and +// encodes this identifier within the !absolute_symbol metadata of that global. +// This encoding ensures that subsequent LDS lowering passes can process these +// barriers correctly without conflicts. 
// //===----------------------------------------------------------------------===// @@ -77,8 +81,8 @@ static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, MDNode::get(Ctx, {MinC, MaxC})); } -template std::vector sortByName(std::vector &&V) { - llvm::sort(V, [](const auto *L, const auto *R) { +template SmallVector sortByName(SmallVector &&V) { + sort(V, [](const auto *L, const auto *R) { return L->getName() < R->getName(); }); return {std::move(V)}; @@ -92,7 +96,7 @@ static bool lowerSpecialLDSVariables( const DataLayout &DL = M.getDataLayout(); // The 1st round: give module-absolute assignments int NumAbsolutes = 0; - std::vector OrderedGVs; + SmallVector OrderedGVs; for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { GlobalVariable *GV = K.first; if (!isNamedBarrier(*GV)) @@ -111,7 +115,7 @@ static bool lowerSpecialLDSVariables( } OrderedGVs = sortByName(std::move(OrderedGVs)); for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; unsigned BarId = NumAbsolutes + 1; unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; NumAbsolutes += BarCnt; @@ -126,7 +130,7 @@ static bool lowerSpecialLDSVariables( // The 2nd round: give a kernel-relative assignment for GV that // either only indirectly accessed by single kernel or only directly // accessed by multiple kernels. 
- std::vector OrderedKernels; + SmallVector OrderedKernels; for (auto &K : LDSUsesInfo.direct_access) { Function *F = K.first; assert(isKernelLDS(F)); @@ -134,7 +138,7 @@ static bool lowerSpecialLDSVariables( } OrderedKernels = sortByName(std::move(OrderedKernels)); - llvm::DenseMap Kernel2BarId; + DenseMap Kernel2BarId; for (Function *F : OrderedKernels) { for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { if (!isNamedBarrier(*GV)) @@ -153,7 +157,7 @@ static bool lowerSpecialLDSVariables( // create a new GV used only by this kernel and its function. auto NewGV = uniquifyGVPerKernel(M, GV, F); Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; unsigned BarId = Kernel2BarId[F]; BarId += NumAbsolutes + 1; unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; From f30faccec86c4e91e05db9017515d4e830ef6a10 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:06:43 +0530 Subject: [PATCH 04/16] Rename pass to amdgpu-lower-exec-sync --- llvm/lib/Target/AMDGPU/AMDGPU.h | 10 ++-- ...SpecialLDS.cpp => AMDGPULowerExecSync.cpp} | 47 ++++++++++--------- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +- ...ecial-lds.ll => amdgpu-lower-exec-sync.ll} | 2 +- 6 files changed, 33 insertions(+), 32 deletions(-) rename llvm/lib/Target/AMDGPU/{AMDGPULowerSpecialLDS.cpp => AMDGPULowerExecSync.cpp} (84%) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds.ll => amdgpu-lower-exec-sync.ll} (97%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 802cf0b40e5e9..5af2a2755cec3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -298,12 +298,12 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin { bool GlobalOpt; }; -void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &); -extern 
char &AMDGPULowerSpecialLDSLegacyPassID; -ModulePass *createAMDGPULowerSpecialLDSLegacyPass(); +void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &); +extern char &AMDGPULowerExecSyncLegacyPassID; +ModulePass *createAMDGPULowerExecSyncLegacyPass(); -struct AMDGPULowerSpecialLDSPass : PassInfoMixin { - AMDGPULowerSpecialLDSPass() {} +struct AMDGPULowerExecSyncPass : PassInfoMixin { + AMDGPULowerExecSyncPass() {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp similarity index 84% rename from llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp rename to llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index ae869a841f7f0..4b640362e8887 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===// +//===-- AMDGPULowerExecSync.cpp -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This pass performs lowering of LDS global variables with target extension -// type "amdgpu.named.barrier" that require specialized address assignment. It -// assigns a unique barrier identifier to each named-barrier LDS variable and -// encodes this identifier within the !absolute_symbol metadata of that global. +// AMDGPU Lower Execution Synchronization pass performs lowering of +// LDS global variables with target extension type "amdgpu.named.barrier" +// that require specialized address assignment. It assigns a unique +// barrier identifier to each named-barrier LDS variable and encodes +// this identifier within the !absolute_symbol metadata of that global. 
// This encoding ensures that subsequent LDS lowering passes can process these // barriers correctly without conflicts. // @@ -29,7 +30,7 @@ #include -#define DEBUG_TYPE "amdgpu-lower-special-lds" +#define DEBUG_TYPE "amdgpu-lower-exec-sync" using namespace llvm; using namespace AMDGPU; @@ -89,7 +90,7 @@ template SmallVector sortByName(SmallVector &&V) { } // Main utility function for special LDS variables lowering. -static bool lowerSpecialLDSVariables( +static bool lowerExecSyncGlobalVariables( Module &M, LDSUsesInfoTy &LDSUsesInfo, VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { bool Changed = false; @@ -178,7 +179,7 @@ static bool lowerSpecialLDSVariables( return Changed; } -static bool runLowerSpecialLDS(Module &M) { +static bool runLowerExecSyncGlobals(Module &M) { CallGraph CG = CallGraph(M); bool Changed = false; Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); @@ -199,40 +200,40 @@ static bool runLowerSpecialLDS(Module &M) { if (LDSUsesInfo.HasSpecialGVs) { // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( + Changed |= lowerExecSyncGlobalVariables( M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); } return Changed; } -class AMDGPULowerSpecialLDSLegacy : public ModulePass { +class AMDGPULowerExecSyncLegacy : public ModulePass { public: static char ID; - AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {} + AMDGPULowerExecSyncLegacy() : ModulePass(ID) {} bool runOnModule(Module &M) override; }; } // namespace -char AMDGPULowerSpecialLDSLegacy::ID = 0; -char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID; +char AMDGPULowerExecSyncLegacy::ID = 0; +char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; -INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, "AMDGPU lowering of special LDS variables", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 
-INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE, +INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, "AMDGPU lowering of special LDS variables", false, false) -bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) { - return runLowerSpecialLDS(M); +bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { + return runLowerExecSyncGlobals(M); } -ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() { - return new AMDGPULowerSpecialLDSLegacy(); +ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() { + return new AMDGPULowerExecSyncLegacy(); } -PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M, - ModuleAnalysisManager &AM) { - return runLowerSpecialLDS(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); +PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M, + ModuleAnalysisManager &AM) { + return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a2fd53ac1b8ef..46d70c257b75e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,7 @@ MODULE_PASS("amdgpu-perf-hint", MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this)) MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) -MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass()) +MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass()) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) #undef MODULE_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 01495a3708ce3..e360a6db3ad78 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -567,7 
+567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerLegacyPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); - initializeAMDGPULowerSpecialLDSLegacyPass(*PR); + initializeAMDGPULowerExecSyncLegacyPass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c401926e22a87..0240315eb7066 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,7 +81,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp - AMDGPULowerSpecialLDS.cpp + AMDGPULowerExecSync.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll similarity index 97% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 28d94f3d42622..9ba9f41d30ffe 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From e8eb89b1544eaf166bce2924e0190c2206bcf8b3 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:35:12 +0530 Subject: [PATCH 05/16] Fix message for legacy pass --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 4b640362e8887..d7124de0a6fd6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,10 +219,10 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of special LDS variables", false, false) + "AMDGPU lowering of execution synchronization globals", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of special LDS variables", false, false) + "AMDGPU lowering of execution synchronization globals", false, false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); From 5fac212c00bfd21d2dcef0535d409e3f372df855 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 22:00:14 +0530 Subject: [PATCH 06/16] Fix format issue --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index d7124de0a6fd6..3343d594f47ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,10 +219,12 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", false, false) + "AMDGPU lowering of execution synchronization globals", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", false, false) + "AMDGPU lowering of execution synchronization globals", + false, false) bool 
AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); From 8277767d2bdc2c481c93c20097a1be794f037fdb Mon Sep 17 00:00:00 2001 From: skc7 Date: Sun, 9 Nov 2025 10:54:44 +0530 Subject: [PATCH 07/16] autogenerate test amdgpu-lower-exec-sync.ll --- .../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 100 ++++++++++++------ 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 9ba9f41d30ffe..782d94845a358 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @@ -6,45 +7,75 @@ @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison -; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0 -; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1 -; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 -; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2 - +;. 
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +;. define void @func1() { - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void +; CHECK-LABEL: define void @func1() { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void } define void @func2() { - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void +; CHECK-LABEL: define void @func2() { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void } define amdgpu_kernel void @kernel1() #0 { -; 
CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier() - call void @func1() - call void @func2() - ret void +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + ret void } define amdgpu_kernel void @kernel2() #0 { -; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call 
void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) - call void @func2() - ret void + call void @func2() + ret void } declare void @llvm.amdgcn.s.barrier() #1 @@ -61,7 +92,12 @@ declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 attributes #0 = { nounwind } attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } - -; CHECK: !0 = !{i32 8396816, i32 8396817} -; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913} -; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +;. 
From b01131a4b991ba202fcd10d7d7c5699604826af4 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 30 Oct 2025 22:42:33 +0530 Subject: [PATCH 08/16] [AMDGPU] Enable amdgpu-lower-special-lds pass in pipeline --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ------------------ llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 6 + llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 14 ++ ...amdgpu-lower-special-lds-and-module-lds.ll | 119 +++++++++++++++++ .../amdgpu-lower-special-lds-and-sw-lds.ll | 86 ++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + .../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +- 9 files changed, 236 insertions(+), 131 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524c43466..3c0328e93ffbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use &U : GV->uses()) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." 
+ KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS { } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e17c2113ca398..f7dff4ba4c5e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -273,6 +273,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { // this is a re-run of the pass // so we don't have anything to do. // - No variables are absolute. + // Named-barriers which are absolute symbols are removed + // from the maps. 
std::optional HasAbsoluteGVs; bool HasSpecialGVs = false; for (auto &Map : {DirectMapKernel, IndirectMapKernel}) { @@ -284,6 +286,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { if (IsDirectMapDynLDSGV) continue; if (isNamedBarrier(*GV)) { + if (IsAbsolute) { + DirectMapKernel[Fn].erase(GV); + IndirectMapKernel[Fn].erase(GV); + } HasSpecialGVs = true; continue; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4a9437b37aa39..827326ae90a75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -293,7 +293,8 @@ void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { if (!AMDGPU::isLDSVariableToLower(*GV)) continue; - + if (isNamedBarrier(*GV)) + continue; for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { Function *F = I->getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e360a6db3ad78..85b5775ce91af 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,6 +465,11 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt + EnableLowerSpecialLDS("amdgpu-enable-lower-special-lds", + cl::desc("Enable lowering of special lds pass."), + cl::init(true), cl::Hidden); + static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", cl::desc("Enable lowering of lds to global memory pass " @@ -963,6 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. 
+ if (EnableLowerSpecialLDS) + PM.addPass(AMDGPULowerSpecialLDSPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1334,6 +1341,10 @@ void AMDGPUPassConfig::addIRPasses() { // Make enqueued block runtime handles externally visible. addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); + // Lower special LDS accesses. + if (EnableLowerSpecialLDS) + addPass(createAMDGPULowerSpecialLDSLegacyPass()); + // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) addPass(createAMDGPUSwLowerLDSLegacyPass(&TM)); @@ -2081,6 +2092,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); + if (EnableLowerSpecialLDS) + addPass(AMDGPULowerSpecialLDSPass()); + if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll new file mode 100644 index 0000000000000..73cde6405ae1f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. 
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]] +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +;. +define void @func1() #0 { +; CHECK-LABEL: define void @func1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + +define void @func2() #0 { +; CHECK-LABEL: define void @func2( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + 
store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel1() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: call void @func1() +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @func1() + call void @func2() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @kernel2() #0 { +; CHECK-LABEL: define amdgpu_kernel void @kernel2( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK-NEXT: call void @func2() +; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr 
addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @func2() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: [[META0]] = !{i32 8396816, i32 8396817} +; CHECK: [[META1]] = !{i32 8396912, i32 8396913} +; CHECK: [[META2]] = !{i32 8396848, i32 8396849} +; CHECK: [[META3]] = !{i32 0, i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll new file mode 100644 index 0000000000000..3127f1feac230 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll @@ -0,0 +1,86 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } +@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison +@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison +@lds1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] +; CHECK: @bar1.barkernel = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; +define void @foo() #0 { +; CHECK-LABEL: define void @foo( +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + ret void +} + 
+define void @bar() #0 { +; CHECK-LABEL: define void @bar( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + store i8 7, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @fookernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @fookernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) +; CHECK: call void @llvm.amdgcn.s.barrier() + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier() + call void @foo() + call void @bar() + store i8 9, ptr addrspace(3) @lds1, align 4 + ret void +} + +define amdgpu_kernel void @barkernel() #0 { +; CHECK-LABEL: define amdgpu_kernel void @barkernel( +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.barkernel, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.barkernel) +; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) + call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) + call void @llvm.amdgcn.s.barrier.wait(i16 1) + call void @bar() + store i8 10, ptr addrspace(3) @lds1, align 4 + ret void +} + +declare void 
@llvm.amdgcn.s.barrier() #1 +declare void @llvm.amdgcn.s.barrier.wait(i16) #1 +declare void @llvm.amdgcn.s.barrier.signal(i32) #1 +declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1 +declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1 +declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1 +declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1 +declare void @llvm.amdgcn.s.barrier.leave(i16) #1 +declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1 +declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1 + +attributes #0 = { nounwind sanitize_address } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 8e7389ace9c5c..69dfbc2a2ae51 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,11 +9,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr
-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-eliminat
ion,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,r
equire,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,
si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index ee6caab6f25cd..7e5b9a22f0352 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,6 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles +; GCN-O0-NEXT: AMDGPU lowering of special LDS variables ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -197,6 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; 
GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles +; GCN-O1-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -489,6 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles +; GCN-O1-OPTS-NEXT: AMDGPU lowering of special LDS variables ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -810,6 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles +; GCN-O2-NEXT: AMDGPU lowering of special LDS variables ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1135,6 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles +; GCN-O3-NEXT: AMDGPU lowering of special LDS variables ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 03a666fbe3aea..4fd728dfc9191 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 926080b283cd0843fa2a4fb7861b86fe105c6be9 Mon Sep 17 00:00:00 2001 From: skc7 Date: Mon, 3 Nov 2025 14:37:36 +0530 Subject: [PATCH 09/16] Fix tests --- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 5 +- ...amdgpu-lower-special-lds-and-module-lds.ll | 3 + .../amdgpu-lower-special-lds-and-sw-lds.ll | 57 +++++++------------ 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 827326ae90a75..3591c3c335338 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -291,9 +291,8 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { - if (!AMDGPU::isLDSVariableToLower(*GV)) - continue; - if (isNamedBarrier(*GV)) + // named-barrier globals are lowered by amdgpu-lower-special-lds pass. 
+ if (!AMDGPU::isLDSVariableToLower(*GV) || isNamedBarrier(*GV)) continue; for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll index 73cde6405ae1f..1ddbaf8b5d94d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll @@ -2,6 +2,9 @@ ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s +; Test to ensure that special LDS variables like named barriers are lowered correctly, +; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-special-lds pass. + %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll index 3127f1feac230..a185249488cdb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll @@ -1,34 +1,25 @@ ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s +; Test to ensure that special LDS variables like named barriers are lowered correctly in asan scenario, +; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-special-lds pass. 
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison -@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison @lds1 = internal addrspace(3) global [1 x i8] poison, align 4 ;. ; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]] -; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]] -; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]] -; CHECK: @bar1.barkernel = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]] +; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]] ; -define void @foo() #0 { -; CHECK-LABEL: define void @foo( -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) -; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1) - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - ret void -} - define void @bar() #0 { ; CHECK-LABEL: define void @bar( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) ; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) ; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4 +; call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void 
@llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) call void @llvm.amdgcn.s.barrier.wait(i16 1) @@ -36,29 +27,18 @@ define void @bar() #0 { ret void } -define amdgpu_kernel void @fookernel() #0 { -; CHECK-LABEL: define amdgpu_kernel void @fookernel( -; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) -; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) -; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) -; CHECK: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) -; CHECK: call void @llvm.amdgcn.s.barrier() - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11) - call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier.wait(i16 1) - %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1) - call void @llvm.amdgcn.s.barrier() - call void @foo() - call void @bar() - store i8 9, ptr addrspace(3) @lds1, align 4 - ret void -} - define amdgpu_kernel void @barkernel() #0 { ; CHECK-LABEL: define amdgpu_kernel void @barkernel( -; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.barkernel, i32 9) -; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.barkernel) +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}}) +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) +; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) ; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1) +; CHECK: call void @bar() +; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4 +; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}}) +; call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1) call void 
@llvm.amdgcn.s.barrier.wait(i16 1) @@ -84,3 +64,10 @@ attributes #2 = { nounwind readnone } !llvm.module.flags = !{!0} !0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address } +; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" } +;. +; CHECK: [[META0]] = !{i32 8396880, i32 8396881} +; CHECK: [[META1]] = !{i32 8396816, i32 8396817} +;. From 17791e8610f393c54e77d501f2c146374fa5bf90 Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 5 Nov 2025 21:28:58 +0530 Subject: [PATCH 10/16] update names --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18 +++++++++--------- ...> amdgpu-lower-exec-sync-and-module-lds.ll} | 6 +++--- ...ll => amdgpu-lower-exec-sync-and-sw-lds.ll} | 6 +++--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds-and-module-lds.ll => amdgpu-lower-exec-sync-and-module-lds.ll} (96%) rename llvm/test/CodeGen/AMDGPU/{amdgpu-lower-special-lds-and-sw-lds.ll => amdgpu-lower-exec-sync-and-sw-lds.ll} (92%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 85b5775ce91af..0890b82eb45ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -466,9 +466,9 @@ static cl::opt EnableScalarIRPasses( cl::Hidden); static cl::opt - EnableLowerSpecialLDS("amdgpu-enable-lower-special-lds", - cl::desc("Enable lowering of special lds pass."), - cl::init(true), cl::Hidden); + EnableLowerExecSync("amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of exec sync pass."), + cl::init(true), cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", @@ -968,8 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // We want to support the -lto-partitions=N 
option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. - if (EnableLowerSpecialLDS) - PM.addPass(AMDGPULowerSpecialLDSPass()); + if (EnableLowerExecSync) + PM.addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) PM.addPass(AMDGPUSwLowerLDSPass(*this)); if (EnableLowerModuleLDS) @@ -1342,8 +1342,8 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass()); // Lower special LDS accesses. - if (EnableLowerSpecialLDS) - addPass(createAMDGPULowerSpecialLDSLegacyPass()); + if (EnableLowerExecSync) + addPass(createAMDGPULowerExecSyncLegacyPass()); // Lower LDS accesses to global memory pass if address sanitizer is enabled. if (EnableSwLowerLDS) @@ -2092,8 +2092,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const { addPass(AMDGPUExportKernelRuntimeHandlesPass()); - if (EnableLowerSpecialLDS) - addPass(AMDGPULowerSpecialLDSPass()); + if (EnableLowerExecSync) + addPass(AMDGPULowerExecSyncPass()); if (EnableSwLowerLDS) addPass(AMDGPUSwLowerLDSPass(TM)); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll similarity index 96% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll index 1ddbaf8b5d94d..bed8fa20a5044 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc < %s 
-enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s -; Test to ensure that special LDS variables like named barriers are lowered correctly, -; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-special-lds pass. +; Test to ensure that LDS variables like named barriers are lowered correctly, +; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll similarity index 92% rename from llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll rename to llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll index a185249488cdb..05f2f07c84503 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds-and-sw-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll @@ -1,8 +1,8 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s ; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s -; Test to ensure that special LDS variables like named barriers are lowered correctly in asan scenario, -; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-special-lds pass. +; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario, +; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass. 
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 7e5b9a22f0352..a366e9508fc1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of special LDS variables +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of special LDS variables +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of special LDS variables +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function 
Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of special LDS variables +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of special LDS variables +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization globals ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll index 4fd728dfc9191..9f3dfb01282bc 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-special-lds,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 71d07c39a139da828c8fccd82af5ac60b83572f6 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 6 Nov 2025 12:50:33 +0530 Subject: [PATCH 11/16] remove changes from prior LDS lowerin passes --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 +- 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
index 3c0328e93ffbd..a4ef524c43466 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,6 +922,126 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } + static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, + Function *KF) { + bool NeedsReplacement = false; + for (Use &U : GV->uses()) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (isKernelLDS(F) && F != KF) { + NeedsReplacement = true; + break; + } + } + } + if (!NeedsReplacement) + return GV; + // Create a new GV used only by this kernel and its function + GlobalVariable *NewGV = new GlobalVariable( + M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + for (Use &U : make_early_inc_range(GV->uses())) { + if (auto *I = dyn_cast(U.getUser())) { + Function *F = I->getFunction(); + if (!isKernelLDS(F) || F == KF) { + U.getUser()->replaceUsesOfWith(GV, NewGV); + } + } + } + return NewGV; + } + + bool lowerSpecialLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // The 1st round: give module-absolute assignments + int NumAbsolutes = 0; + std::vector OrderedGVs; + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + GlobalVariable *GV = K.first; + if (!isNamedBarrier(*GV)) + continue; + // give a module-absolute assignment if it is indirectly accessed by + // multiple kernels. This is not precise, but we don't want to duplicate + // a function when it is called by multiple kernels. 
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { + OrderedGVs.push_back(GV); + } else { + // leave it to the 2nd round, which will give a kernel-relative + // assignment if it is only indirectly accessed by one kernel + LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); + } + LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + NumAbsolutes += BarCnt; + + // 4 bits for alignment, 5 bits for the barrier num, + // 3 bits for the barrier scope + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, GV, Offset); + } + OrderedGVs.clear(); + + // The 2nd round: give a kernel-relative assignment for GV that + // either only indirectly accessed by single kernel or only directly + // accessed by multiple kernels. + std::vector OrderedKernels; + for (auto &K : LDSUsesInfo.direct_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + OrderedKernels.push_back(F); + } + OrderedKernels = sortByName(std::move(OrderedKernels)); + + llvm::DenseMap Kernel2BarId; + for (Function *F : OrderedKernels) { + for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { + if (!isNamedBarrier(*GV)) + continue; + + LDSUsesInfo.direct_access[F].erase(GV); + if (GV->isAbsoluteSymbolRef()) { + // already assigned + continue; + } + OrderedGVs.push_back(GV); + } + OrderedGVs = sortByName(std::move(OrderedGVs)); + for (GlobalVariable *GV : OrderedGVs) { + // GV could also be used directly by other kernels. If so, we need to + // create a new GV used only by this kernel and its function. 
+ auto NewGV = uniquifyGVPerKernel(M, GV, F); + Changed |= (NewGV != GV); + unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; + unsigned BarId = Kernel2BarId[F]; + BarId += NumAbsolutes + 1; + unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; + Kernel2BarId[F] += BarCnt; + unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; + recordLDSAbsoluteAddress(&M, NewGV, Offset); + } + OrderedGVs.clear(); + } + // Also erase those special LDS variables from indirect_access. + for (auto &K : LDSUsesInfo.indirect_access) { + assert(isKernelLDS(K.first)); + for (GlobalVariable *GV : K.second) { + if (isNamedBarrier(*GV)) + K.second.erase(GV); + } + } + return Changed; + } + bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -944,6 +1064,12 @@ class AMDGPULowerModuleLDS { } } + if (LDSUsesInfo.HasSpecialGVs) { + // Special LDS variables need special address assignment + Changed |= lowerSpecialLDSVariables( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); + } + // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 3591c3c335338..4a9437b37aa39 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -291,9 +291,9 @@ void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) { void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() { for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) { - // named-barrier globals are lowered by amdgpu-lower-special-lds pass. 
- if (!AMDGPU::isLDSVariableToLower(*GV) || isNamedBarrier(*GV)) + if (!AMDGPU::isLDSVariableToLower(*GV)) continue; + for (User *V : GV->users()) { if (auto *I = dyn_cast(V)) { Function *F = I->getFunction(); From 84594f2429f1c65fe4f28d6bb93f48d9309ddaef Mon Sep 17 00:00:00 2001 From: skc7 Date: Sun, 9 Nov 2025 10:59:11 +0530 Subject: [PATCH 12/16] Update amdgpu-lower-exec-sync.ll test with llc RUN line --- llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll index 782d94845a358..bde6db6463cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s +; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-exec-sync -mtriple=amdgcn-amd-amdhsa | FileCheck %s %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) } From 862d70282db157c65b6a927c6128696c443ac946 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 6 Nov 2025 14:29:17 +0530 Subject: [PATCH 13/16] [AMDGPU] Remove lowering named-barrier LDS logic from amdgpu-lower-module-lds --- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 126 ------------------ 1 file changed, 126 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index a4ef524c43466..3c0328e93ffbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS { return KernelToCreatedDynamicLDS; } - static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV, - Function *KF) { - bool NeedsReplacement = false; - for (Use
&U : GV->uses()) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (isKernelLDS(F) && F != KF) { - NeedsReplacement = true; - break; - } - } - } - if (!NeedsReplacement) - return GV; - // Create a new GV used only by this kernel and its function - GlobalVariable *NewGV = new GlobalVariable( - M, GV->getValueType(), GV->isConstant(), GV->getLinkage(), - GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr, - GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - NewGV->copyAttributesFrom(GV); - for (Use &U : make_early_inc_range(GV->uses())) { - if (auto *I = dyn_cast(U.getUser())) { - Function *F = I->getFunction(); - if (!isKernelLDS(F) || F == KF) { - U.getUser()->replaceUsesOfWith(GV, NewGV); - } - } - } - return NewGV; - } - - bool lowerSpecialLDSVariables( - Module &M, LDSUsesInfoTy &LDSUsesInfo, - VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) { - bool Changed = false; - const DataLayout &DL = M.getDataLayout(); - // The 1st round: give module-absolute assignments - int NumAbsolutes = 0; - std::vector OrderedGVs; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - GlobalVariable *GV = K.first; - if (!isNamedBarrier(*GV)) - continue; - // give a module-absolute assignment if it is indirectly accessed by - // multiple kernels. This is not precise, but we don't want to duplicate - // a function when it is called by multiple kernels. 
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) { - OrderedGVs.push_back(GV); - } else { - // leave it to the 2nd round, which will give a kernel-relative - // assignment if it is only indirectly accessed by one kernel - LDSUsesInfo.direct_access[*K.second.begin()].insert(GV); - } - LDSToKernelsThatNeedToAccessItIndirectly.erase(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - NumAbsolutes += BarCnt; - - // 4 bits for alignment, 5 bits for the barrier num, - // 3 bits for the barrier scope - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, GV, Offset); - } - OrderedGVs.clear(); - - // The 2nd round: give a kernel-relative assignment for GV that - // either only indirectly accessed by single kernel or only directly - // accessed by multiple kernels. - std::vector OrderedKernels; - for (auto &K : LDSUsesInfo.direct_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - OrderedKernels.push_back(F); - } - OrderedKernels = sortByName(std::move(OrderedKernels)); - - llvm::DenseMap Kernel2BarId; - for (Function *F : OrderedKernels) { - for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) { - if (!isNamedBarrier(*GV)) - continue; - - LDSUsesInfo.direct_access[F].erase(GV); - if (GV->isAbsoluteSymbolRef()) { - // already assigned - continue; - } - OrderedGVs.push_back(GV); - } - OrderedGVs = sortByName(std::move(OrderedGVs)); - for (GlobalVariable *GV : OrderedGVs) { - // GV could also be used directly by other kernels. If so, we need to - // create a new GV used only by this kernel and its function. 
- auto NewGV = uniquifyGVPerKernel(M, GV, F); - Changed |= (NewGV != GV); - unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP; - unsigned BarId = Kernel2BarId[F]; - BarId += NumAbsolutes + 1; - unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16; - Kernel2BarId[F] += BarCnt; - unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4; - recordLDSAbsoluteAddress(&M, NewGV, Offset); - } - OrderedGVs.clear(); - } - // Also erase those special LDS variables from indirect_access. - for (auto &K : LDSUsesInfo.indirect_access) { - assert(isKernelLDS(K.first)); - for (GlobalVariable *GV : K.second) { - if (isNamedBarrier(*GV)) - K.second.erase(GV); - } - } - return Changed; - } - bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS { } } - if (LDSUsesInfo.HasSpecialGVs) { - // Special LDS variables need special address assignment - Changed |= lowerSpecialLDSVariables( - M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly); - } - // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; From ce406c280bf3d73d27d39f9b361c6a92a0c7b02e Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 12 Nov 2025 10:34:23 +0530 Subject: [PATCH 14/16] Update description of pass --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 ++++---- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 3343d594f47ad..f939eded29102 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -219,11 +219,11 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; 
INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", + "AMDGPU lowering of execution synchronization primitives", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization globals", + "AMDGPU lowering of execution synchronization primitives", false, false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0890b82eb45ed..095e34b86b08f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -465,10 +465,10 @@ static cl::opt EnableScalarIRPasses( cl::init(true), cl::Hidden); -static cl::opt - EnableLowerExecSync("amdgpu-enable-lower-exec-sync", - cl::desc("Enable lowering of exec sync pass."), - cl::init(true), cl::Hidden); +static cl::opt EnableLowerExecSync( + "amdgpu-enable-lower-exec-sync", + cl::desc("Enable lowering of execution synchronization primitives."), + cl::init(true), cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index a366e9508fc1c..8f1335ab87c85 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O0-NEXT: AMDGPU Software lowering of LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA 
impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of execution synchronization globals +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization primitives ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager From 26199e238f014c7cbfc39e7a76a0374f6cb3109f Mon Sep 17 00:00:00 2001 From: skc7 Date: Wed, 12 Nov 2025 14:49:49 +0530 Subject: 
[PATCH 15/16] Use execution synchronization everywhere --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 12 ++++++------ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++-- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index f939eded29102..2938592164f0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPULowerExecSync.cpp -----------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// AMDGPU Lower Execution Synchronization pass performs lowering of +// Lower Execution Synchronization pass performs lowering of // LDS global variables with target extension type "amdgpu.named.barrier" // that require specialized address assignment. 
It assigns a unique // barrier identifier to each named-barrier LDS variable and encodes @@ -219,12 +219,12 @@ char AMDGPULowerExecSyncLegacy::ID = 0; char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID; INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization primitives", - false, false) + "AMDGPU lowering of execution synchronization", false, + false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE, - "AMDGPU lowering of execution synchronization primitives", - false, false) + "AMDGPU lowering of execution synchronization", false, + false) bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) { return runLowerExecSyncGlobals(M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 095e34b86b08f..5ff16e29bbbb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -467,8 +467,8 @@ static cl::opt EnableScalarIRPasses( static cl::opt EnableLowerExecSync( "amdgpu-enable-lower-exec-sync", - cl::desc("Enable lowering of execution synchronization primitives."), - cl::init(true), cl::Hidden); + cl::desc("Enable lowering of execution synchronization."), cl::init(true), + cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 8f1335ab87c85..fe75b2b5bfcf5 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,7 +44,7 @@ ; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O0-NEXT: Function Alias Analysis Results ; GCN-O0-NEXT: Externalize enqueued block runtime handles -; GCN-O0-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O0-NEXT: AMDGPU lowering of execution synchronization ; GCN-O0-NEXT: AMDGPU Software lowering of 
LDS ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager @@ -198,7 +198,7 @@ ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Externalize enqueued block runtime handles -; GCN-O1-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O1-NEXT: AMDGPU lowering of execution synchronization ; GCN-O1-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager @@ -491,7 +491,7 @@ ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles -; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization ; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -813,7 +813,7 @@ ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Externalize enqueued block runtime handles -; GCN-O2-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O2-NEXT: AMDGPU lowering of execution synchronization ; GCN-O2-NEXT: AMDGPU Software lowering of LDS ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1139,7 +1139,7 @@ ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Externalize enqueued block runtime handles -; GCN-O3-NEXT: AMDGPU lowering of execution synchronization primitives +; GCN-O3-NEXT: AMDGPU lowering of execution synchronization ; GCN-O3-NEXT: AMDGPU Software lowering of LDS ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: 
FunctionPass Manager From d9979cfd1469aaf8f3af9e58dfa1488a155ef862 Mon Sep 17 00:00:00 2001 From: skc7 Date: Thu, 13 Nov 2025 21:01:47 +0530 Subject: [PATCH 16/16] Fix pass description --- llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp index 2938592164f0a..89f6b38df9d56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // -// Lower Execution Synchronization pass performs lowering of -// LDS global variables with target extension type "amdgpu.named.barrier" +// Lower LDS global variables with target extension type "amdgpu.named.barrier" // that require specialized address assignment. It assigns a unique // barrier identifier to each named-barrier LDS variable and encodes // this identifier within the !absolute_symbol metadata of that global.