[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals #165692

Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes

This PR introduces the amdgpu-lower-exec-sync pass, which specifically lowers named-barrier LDS globals introduced by #114550. Changes include moving the logic for lowering named-barrier LDS globals from the amdgpu-lower-module-lds pass to this new pass.

See #161827 for discussion on this topic. Full diff: https://github.com/llvm/llvm-project/pull/165692.diff

6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cd8b2495a4250..d878cbfce07f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
bool GlobalOpt;
};
+void initializeAMDGPULowerSpecialLDSLegacyPass(PassRegistry &);
+extern char &AMDGPULowerSpecialLDSLegacyPassID;
+ModulePass *createAMDGPULowerSpecialLDSLegacyPass();
+
+struct AMDGPULowerSpecialLDSPass : PassInfoMixin<AMDGPULowerSpecialLDSPass> {
+ AMDGPULowerSpecialLDSPass() {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
extern char &AMDGPUSwLowerLDSLegacyPassID;
ModulePass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
new file mode 100644
index 0000000000000..5534a3ba6382e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerSpecialLDS.cpp
@@ -0,0 +1,234 @@
+//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the named-barrier LDS globals, which need
+// special address assignment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "amdgpu-lower-special-lds"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (isKernelLDS(F) && F != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its function
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ Function *F = I->getFunction();
+ if (!isKernelLDS(F) || F == KF) {
+ U.getUser()->replaceUsesOfWith(GV, NewGV);
+ }
+ }
+ }
+ return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+ llvm::sort(V, [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
+// Main utility function for special LDS variables lowering.
+static bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ // The 1st round: give module-absolute assignments
+ int NumAbsolutes = 0;
+ std::vector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ NumAbsolutes += BarCnt;
+
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment for GV that
+ // either only indirectly accessed by single kernel or only directly
+ // accessed by multiple kernels.
+ std::vector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // already assigned
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its function.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned BarId = Kernel2BarId[F];
+ BarId += NumAbsolutes + 1;
+ unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+ Kernel2BarId[F] += BarCnt;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ assert(isKernelLDS(K.first));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+}
+
+static bool runLowerSpecialLDS(Module &M) {
+ CallGraph CG = CallGraph(M);
+ bool Changed = false;
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerSpecialLDSVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+ return Changed;
+}
+
+class AMDGPULowerSpecialLDSLegacy : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerSpecialLDSLegacy() : ModulePass(ID) {}
+ bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerSpecialLDSLegacy::ID = 0;
+char &llvm::AMDGPULowerSpecialLDSLegacyPassID = AMDGPULowerSpecialLDSLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of special LDS variables", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerSpecialLDSLegacy, DEBUG_TYPE,
+ "AMDGPU lowering of special LDS variables", false, false)
+
+bool AMDGPULowerSpecialLDSLegacy::runOnModule(Module &M) {
+ return runLowerSpecialLDS(M);
+}
+
+ModulePass *llvm::createAMDGPULowerSpecialLDSLegacyPass() {
+ return new AMDGPULowerSpecialLDSLegacy();
+}
+
+PreservedAnalyses AMDGPULowerSpecialLDSPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return runLowerSpecialLDS(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..a2fd53ac1b8ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint",
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-special-lds", AMDGPULowerSpecialLDSPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
#undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 75a94ac891819..916826ea169aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -567,6 +567,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPULowerSpecialLDSLegacyPass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..c401926e22a87 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
+ AMDGPULowerSpecialLDS.cpp
AMDGPUSwLowerLDS.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
new file mode 100644
index 0000000000000..28d94f3d42622
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-special-lds.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-special-lds < %s 2>&1 | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+
+define void @func1() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+; CHECK: !0 = !{i32 8396816, i32 8396817}
+; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
+; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}
I don't see the changes that remove the logic from LowerModuleLDS. Are they missing?
EDIT: Ah, I see: they're in the next PR that enables the pass.
Please update the PR description to reflect that.
@@ -0,0 +1,234 @@
//===-- AMDGPULowerSpecialLDS.cpp -----------------------------------------===//
Maybe wait for other reviewers to give feedback first, but I think we should name this "LowerBarrierGVs" instead.
I really don't want to see more "special LDS variables" proliferate. Making the pass specific to the barrier GV would help to not encourage others from adding another "special LDS not not really LDS" GV.
I agree we don't want to say "special", but "barrier" is also not accurate because we will soon need to lower semaphores as well.
"amdgpu-lower-sync-lds" Could be used?
Sync here relates to synchronization primitives.
Perhaps "AMDGPULowerExecSync" ("lower execution synchronization") ?
I assume if we want to do other operations on barriers in the future, like add fallback lowerings, or fixup/optimize intrinsics, we'd do it there as well (because it's the path of least resistance, but also it makes sense to deal with all of it in one place instead of spreading it). I would still like to see the GV be removed from the LDS AS so I wouldn't make this LDS specific either.
"AMDGPULowerSyncPrimitives" sounds better ... "ExecSync" induces furious head scratching trying to understand what it could refer to.
//
//===----------------------------------------------------------------------===//
//
// This pass lowers the named-barrier LDS globals, which need
Elaborate a bit more, and add a little code snippet with before/after. E.g.: this assigns a unique barrier ID to every !amdgpu.named.barrier GV and encodes the barrier ID in the !absolute_symbol metadata of the GV, so the LDS lowering pass can safely handle it.
Updated. Thanks for feedback.
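For reference, a minimal standalone sketch of the address encoding this pass records in !absolute_symbol (plain C++; it assumes BARRIER_SCOPE_WORKGROUP encodes as 0, which is what the metadata values in the new test imply):

    #include <cassert>
    #include <cstdint>

    // 4 bits for alignment, 5 bits for the barrier number,
    // 3 bits for the barrier scope (see lowerSpecialLDSVariables).
    static uint32_t encodeNamedBarrier(uint32_t Scope, uint32_t BarId) {
      return 0x802000u | Scope << 9 | BarId << 4;
    }

    int main() {
      // These match the CHECK'd metadata in amdgpu-lower-special-lds.ll
      // (scope assumed to be 0 for the workgroup scope):
      assert(encodeNamedBarrier(0, 1) == 8396816); // @bar2 -> !0
      assert(encodeNamedBarrier(0, 7) == 8396912); // @bar3 -> !1
      assert(encodeNamedBarrier(0, 3) == 8396848); // @bar1 -> !2
      return 0;
    }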
  const DataLayout &DL = M.getDataLayout();
  // The 1st round: give module-absolute assignments
  int NumAbsolutes = 0;
  std::vector<GlobalVariable *> OrderedGVs;
SmallVector
  }
  OrderedGVs = sortByName(std::move(OrderedGVs));
  for (GlobalVariable *GV : OrderedGVs) {
    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
Suggested change:
-    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+    unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
  // The 2nd round: give a kernel-relative assignment for GV that
  // either only indirectly accessed by single kernel or only directly
  // accessed by multiple kernels.
  std::vector<Function *> OrderedKernels;
SmallVector
    // create a new GV used only by this kernel and its function.
    auto NewGV = uniquifyGVPerKernel(M, GV, F);
    Changed |= (NewGV != GV);
    unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
Suggested change:
-      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;

✅ With the latest revision this PR passed the C/C++ code formatter.
Pierre-vh left a comment:
This looks good to me but I'd like other reviewers to chime in before approving
@@ -0,0 +1,67 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s

%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
Can you autogenerate this test instead? It's easier to maintain that way.
Also perhaps worth adding another RUN line with the new pass manager?
Updated this test with auto-generated checks.
A new-PM RUN line has been added in the new patch in #166731.
I don't really see why we need three separate commits for this. It's just existing code getting rebundled into a pass, and that pass is being enabled by default in its new intended place. All this should be done as a single change. This kind of split can hide complexity and errors. With a single change, if anything breaks, then either things were not as simple as they seem, or errors were introduced. But it will all be trackable to this one change.
For any pass that is newly introduced in the AMDGPU backend, developers usually follow a two-PR approach: one PR adds the pass, and a follow-up PR enables it.
I have followed the same for this change as well. Additionally, the removal of the existing logic from amdgpu-lower-module-lds is done in the follow-up PR that enables the pass.
This is not a new pass. It's existing code bundled into a pass as a refactoring. I still think this should be a single change from the point of view of someone looking at the git log. All failures should be traceable to this one transaction. EDIT: On second thought, I am not sure I agree that such a two-step convention exists or needs to exist. Other than catching build failures on obscure platforms, it's as if step 1 does not exist until step 2 has happened. I don't know what the benefit of that is.
@ssahasra Ideally we want to keep the change as small but "atomic" as possible, but I think the argument makes sense. I can live with that.
ssahasra left a comment:
Added a couple of nit-picks. Just pick one name, lower-exec-sync or lower-sync-primitives, and then update it in all places: file name, class name, option name, option description, class description, etc. I see the word "primitive" used only in descriptions, but nowhere else.
@@ -0,0 +1,241 @@
//===-- AMDGPULowerExecSync.cpp -----------------------------------------===//
Remove the file name. It adds no new information, and can become stale if the file is renamed.
//
//===----------------------------------------------------------------------===//
//
// AMDGPU Lower Execution Synchronization pass performs lowering of
Suggested change:
-// AMDGPU Lower Execution Synchronization pass performs lowering of
+// Lower
Force-pushed from ce7449e to 26199e2.
Used "execution synchronization" everywhere after recent changes. Thanks. |
Pierre-vh left a comment:
LGTM but I'd like another person to approve as well before this lands
//
//===----------------------------------------------------------------------===//
//
// Lower Execution Synchronization pass performs lowering of
Suggested change:
-// Lower Execution Synchronization pass performs lowering of
+// Lower
The rest of the line is entirely unnecessary. Just "Lower LDS global variables with ..." works.
ssahasra left a comment:
LGTM. Just one nitpick, but not very important.
  bool NeedsReplacement = false;
  for (Use &U : GV->uses()) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (isKernelLDS(F) && F != KF) {
        NeedsReplacement = true;
        break;
      }
    }
  }
  if (!NeedsReplacement)
    return GV;
Can rewrite this without the temporary variable and break. Should also be over users, and could be any_of
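A minimal sketch of that rewrite (assuming the surrounding patch context: isKernelLDS and llvm::any_of from llvm/ADT/STLExtras.h; the helper name is hypothetical):

    // Returns true if GV is directly used from a kernel other than KF.
    static bool isUsedByOtherKernel(GlobalVariable *GV, Function *KF) {
      return any_of(GV->users(), [&](User *U) {
        auto *I = dyn_cast<Instruction>(U);
        return I && isKernelLDS(I->getFunction()) && I->getFunction() != KF;
      });
    }

    // uniquifyGVPerKernel would then start with:
    //   if (!isUsedByOtherKernel(GV, KF))
    //     return GV;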
template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
  sort(V, [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
This will need tie breakers for anonymous globals; see the sketch below.
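A sketch of one possible tie breaker (hypothetical: Index is assumed to be a precomputed DenseMap from each global to its discovery order, since unnamed globals all compare equal by name):

    sort(V, [&](const auto *L, const auto *R) {
      if (L->getName() != R->getName())
        return L->getName() < R->getName();
      // Fall back to a deterministic discovery index for unnamed globals.
      return Index.lookup(L) < Index.lookup(R);
    });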
  // either only indirectly accessed by single kernel or only directly
  // accessed by multiple kernels.
  SmallVector<Function *> OrderedKernels;
  for (auto &K : LDSUsesInfo.direct_access) {
Structured binding, probably without the &?
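A sketch of the structured-binding form (keeping a const reference here, on the assumption that direct_access maps to a set whose per-iteration copy we want to avoid):

    for (const auto &[F, GVs] : LDSUsesInfo.direct_access) {
      (void)GVs; // only the key is needed in this loop
      assert(isKernelLDS(F));
      OrderedKernels.push_back(F);
    }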
    assert(isKernelLDS(K.first));
    for (GlobalVariable *GV : K.second) {
      if (isNamedBarrier(*GV))
        K.second.erase(GV);
Probably not safe to do this erase inside the range loop over the set
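A minimal sketch of a safe alternative (collect first, erase afterwards; assumes K.second is a set of GlobalVariable pointers as in the patch):

    SmallVector<GlobalVariable *> ToErase;
    for (GlobalVariable *GV : K.second)
      if (isNamedBarrier(*GV))
        ToErase.push_back(GV);
    for (GlobalVariable *GV : ToErase)
      K.second.erase(GV);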
    OrderedGVs.clear();
  }
  // Also erase those special LDS variables from indirect_access.
  for (auto &K : LDSUsesInfo.indirect_access) {
No auto
      if (!isNamedBarrier(*GV))
        continue;

      LDSUsesInfo.direct_access[F].erase(GV);
Avoid repeated map lookups
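A sketch of the hoisted lookup (the reference into the map stays valid as long as the map itself is not mutated inside the loop):

    auto &DirectAccess = LDSUsesInfo.direct_access[F];
    for (GlobalVariable *GV : DirectAccess) {
      if (!isNamedBarrier(*GV))
        continue;
      // ... use GV; erasing from DirectAccess here would still need the
      // collect-then-erase pattern noted above.
    }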
FWIW, @arsenm, you are commenting on the quality of code that was previously submitted with just a cursory review. These issues can be addressed separately and don't need to block this change.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/202/builds/4220
This picks up the following fix: * llvm/llvm-project#165692
This PR introduces the amdgpu-lower-exec-sync pass, which specifically lowers named-barrier LDS globals introduced by #114550.

Changes include:
- Moving the logic for lowering named-barrier LDS globals from the amdgpu-lower-module-lds pass to this new pass.
- This PR adds the pass; the existing lowering logic for named-barrier LDS is removed from amdgpu-lower-module-lds in the follow-up PR that enables the pass.

See #161827 for discussion on this topic.