
Commit 49d5bb0
[AMDGPU] Add amdgpu-lower-exec-sync pass to lower named-barrier globals (#165692)
This PR introduces the `amdgpu-lower-exec-sync` pass, which lowers the named-barrier LDS globals introduced by #114550. Changes include:

- Moving the logic for lowering named-barrier LDS globals out of the `amdgpu-lower-module-lds` pass and into the new pass.
- Adding the new pass to the pipeline and removing the existing named-barrier LDS lowering logic from `amdgpu-lower-module-lds`.

See #161827 for discussion on this topic.
1 parent d5cdfd4 commit 49d5bb0

13 files changed (+580, -130 lines)
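The new pass is exposed both as a legacy ModulePass and as a new-pass-manager pass. As a minimal editorial sketch, using only the declarations this commit adds to AMDGPU.h below (the commit's actual pipeline wiring lives in AMDGPUTargetMachine and is not shown in this excerpt; the pass is presumably also reachable via opt under a name matching its DEBUG_TYPE, "amdgpu-lower-exec-sync"), a client could schedule it like so:

    #include "AMDGPU.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    // Add the exec-sync lowering to a module pipeline (new pass manager).
    static void addExecSyncLowering(ModulePassManager &MPM) {
      MPM.addPass(AMDGPULowerExecSyncPass());
    }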

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 9 additions & 0 deletions

@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
   bool GlobalOpt;
 };
 
+void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &);
+extern char &AMDGPULowerExecSyncLegacyPassID;
+ModulePass *createAMDGPULowerExecSyncLegacyPass();
+
+struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> {
+  AMDGPULowerExecSyncPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
 extern char &AMDGPUSwLowerLDSLegacyPassID;
 ModulePass *
llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp (new file)

Lines changed: 240 additions & 0 deletions

@@ -0,0 +1,240 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lower LDS global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. The pass assigns a unique
// barrier identifier to each named-barrier LDS variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
// This encoding ensures that subsequent LDS lowering passes can process these
// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
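
// Editorial sketch, not part of this commit: given a named-barrier LDS
// global declared roughly as
//   @bar = internal addrspace(3) global target("amdgpu.named.barrier", 0) poison
// the pass records an !absolute_symbol half-open range on the global, e.g.
//   !0 = !{i32 8396816, i32 8396817} ; i.e. [0x802010, 0x802011)
// for the first assigned barrier id, assuming the workgroup scope value
// encodes as 0 (the real value comes from AMDGPU::Barrier, used below).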

#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"

#include <algorithm>

#define DEBUG_TYPE "amdgpu-lower-exec-sync"

using namespace llvm;
using namespace AMDGPU;

namespace {

// If GV is also used directly by other kernels, create a new GV
// used only by this kernel and its function.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
                                           Function *KF) {
  bool NeedsReplacement = false;
  for (Use &U : GV->uses()) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (isKernelLDS(F) && F != KF) {
        NeedsReplacement = true;
        break;
      }
    }
  }
  if (!NeedsReplacement)
    return GV;
  // Create a new GV used only by this kernel and its function
  GlobalVariable *NewGV = new GlobalVariable(
      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
  NewGV->copyAttributesFrom(GV);
  for (Use &U : make_early_inc_range(GV->uses())) {
    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
      Function *F = I->getFunction();
      if (!isKernelLDS(F) || F == KF) {
        U.getUser()->replaceUsesOfWith(GV, NewGV);
      }
    }
  }
  return NewGV;
}

// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half-open range, [Address, Address + 1).
static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
                                     uint32_t Address) {
  LLVMContext &Ctx = M->getContext();
  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
  GV->setMetadata(LLVMContext::MD_absolute_symbol,
                  MDNode::get(Ctx, {MinC, MaxC}));
}

template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
  sort(V, [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {std::move(V)};
}

// Main utility function for special LDS variables lowering.
static bool lowerExecSyncGlobalVariables(
    Module &M, LDSUsesInfoTy &LDSUsesInfo,
    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // The 1st round: give module-absolute assignments.
  int NumAbsolutes = 0;
  SmallVector<GlobalVariable *> OrderedGVs;
  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
    GlobalVariable *GV = K.first;
    if (!isNamedBarrier(*GV))
      continue;
    // Give a module-absolute assignment if it is indirectly accessed by
    // multiple kernels. This is not precise, but we don't want to duplicate
    // a function when it is called by multiple kernels.
    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
      OrderedGVs.push_back(GV);
    } else {
      // Leave it to the 2nd round, which will give a kernel-relative
      // assignment if it is only indirectly accessed by one kernel.
      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
    }
    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
  }
  OrderedGVs = sortByName(std::move(OrderedGVs));
  for (GlobalVariable *GV : OrderedGVs) {
    unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
    unsigned BarId = NumAbsolutes + 1;
    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
    NumAbsolutes += BarCnt;

    // 4 bits for alignment, 5 bits for the barrier num,
    // 3 bits for the barrier scope.
    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
    recordLDSAbsoluteAddress(&M, GV, Offset);
  }
  OrderedGVs.clear();
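
  // Editorial worked example, not part of this commit: the first
  // module-absolute barrier above gets BarId == 1. Assuming the workgroup
  // scope encodes as 0, the packed value is
  //   0x802000 | (0 << 9) | (1 << 4) == 0x802010.
  // A 32-byte named-barrier type yields BarCnt == 32 / 16 == 2, so
  // NumAbsolutes becomes 2 and the next barrier receives BarId == 3.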

  // The 2nd round: give a kernel-relative assignment to each GV that is
  // either only indirectly accessed by a single kernel or only directly
  // accessed by multiple kernels.
  SmallVector<Function *> OrderedKernels;
  for (auto &K : LDSUsesInfo.direct_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    OrderedKernels.push_back(F);
  }
  OrderedKernels = sortByName(std::move(OrderedKernels));

  DenseMap<Function *, uint32_t> Kernel2BarId;
  for (Function *F : OrderedKernels) {
    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
      if (!isNamedBarrier(*GV))
        continue;

      LDSUsesInfo.direct_access[F].erase(GV);
      if (GV->isAbsoluteSymbolRef()) {
        // Already assigned.
        continue;
      }
      OrderedGVs.push_back(GV);
    }
    OrderedGVs = sortByName(std::move(OrderedGVs));
    for (GlobalVariable *GV : OrderedGVs) {
      // GV could also be used directly by other kernels. If so, we need to
      // create a new GV used only by this kernel and its function.
      auto NewGV = uniquifyGVPerKernel(M, GV, F);
      Changed |= (NewGV != GV);
      unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
      unsigned BarId = Kernel2BarId[F];
      BarId += NumAbsolutes + 1;
      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
      Kernel2BarId[F] += BarCnt;
      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
      recordLDSAbsoluteAddress(&M, NewGV, Offset);
    }
    OrderedGVs.clear();
  }
  // Also erase those special LDS variables from indirect_access.
  for (auto &K : LDSUsesInfo.indirect_access) {
    assert(isKernelLDS(K.first));
    for (GlobalVariable *GV : K.second) {
      if (isNamedBarrier(*GV))
        K.second.erase(GV);
    }
  }
  return Changed;
}

static bool runLowerExecSyncGlobals(Module &M) {
  CallGraph CG = CallGraph(M);
  bool Changed = false;
  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);

  // For each kernel, what variables does it access directly or through
  // callees?
  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);

  // For each variable accessed through callees, which kernels access it?
  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
  for (auto &K : LDSUsesInfo.indirect_access) {
    Function *F = K.first;
    assert(isKernelLDS(F));
    for (GlobalVariable *GV : K.second) {
      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
    }
  }

  if (LDSUsesInfo.HasSpecialGVs) {
    // Special LDS variables need special address assignment.
    Changed |= lowerExecSyncGlobalVariables(
        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
  }
  return Changed;
}

class AMDGPULowerExecSyncLegacy : public ModulePass {
public:
  static char ID;
  AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
  bool runOnModule(Module &M) override;
};

} // namespace

char AMDGPULowerExecSyncLegacy::ID = 0;
char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;

INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                      "AMDGPU lowering of execution synchronization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
                    "AMDGPU lowering of execution synchronization", false,
                    false)

bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
  return runLowerExecSyncGlobals(M);
}

ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
  return new AMDGPULowerExecSyncLegacy();
}

PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
  return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
                                    : PreservedAnalyses::all();
}
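
For completeness, a hedged editorial sketch of driving the same lowering standalone through the legacy pass manager, using only the createAMDGPULowerExecSyncLegacyPass() factory declared in AMDGPU.h above:

    #include "AMDGPU.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Run just the exec-sync lowering over a module; returns true if the
    // module was modified.
    static bool lowerExecSyncOnly(Module &M) {
      legacy::PassManager PM;
      PM.add(createAMDGPULowerExecSyncLegacyPass());
      return PM.run(M);
    }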

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

Lines changed: 0 additions & 126 deletions

@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
     return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
-                                             Function *KF) {
-    bool NeedsReplacement = false;
-    for (Use &U : GV->uses()) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (isKernelLDS(F) && F != KF) {
-          NeedsReplacement = true;
-          break;
-        }
-      }
-    }
-    if (!NeedsReplacement)
-      return GV;
-    // Create a new GV used only by this kernel and its function
-    GlobalVariable *NewGV = new GlobalVariable(
-        M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-        GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-        GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-    NewGV->copyAttributesFrom(GV);
-    for (Use &U : make_early_inc_range(GV->uses())) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (!isKernelLDS(F) || F == KF) {
-          U.getUser()->replaceUsesOfWith(GV, NewGV);
-        }
-      }
-    }
-    return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-      Module &M, LDSUsesInfoTy &LDSUsesInfo,
-      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-    bool Changed = false;
-    const DataLayout &DL = M.getDataLayout();
-    // The 1st round: give module-absolute assignments
-    int NumAbsolutes = 0;
-    std::vector<GlobalVariable *> OrderedGVs;
-    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-      GlobalVariable *GV = K.first;
-      if (!isNamedBarrier(*GV))
-        continue;
-      // give a module-absolute assignment if it is indirectly accessed by
-      // multiple kernels. This is not precise, but we don't want to duplicate
-      // a function when it is called by multiple kernels.
-      if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-        OrderedGVs.push_back(GV);
-      } else {
-        // leave it to the 2nd round, which will give a kernel-relative
-        // assignment if it is only indirectly accessed by one kernel
-        LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-      }
-      LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-    }
-    OrderedGVs = sortByName(std::move(OrderedGVs));
-    for (GlobalVariable *GV : OrderedGVs) {
-      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-      unsigned BarId = NumAbsolutes + 1;
-      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-      NumAbsolutes += BarCnt;
-
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-      recordLDSAbsoluteAddress(&M, GV, Offset);
-    }
-    OrderedGVs.clear();
-
-    // The 2nd round: give a kernel-relative assignment for GV that
-    // either only indirectly accessed by single kernel or only directly
-    // accessed by multiple kernels.
-    std::vector<Function *> OrderedKernels;
-    for (auto &K : LDSUsesInfo.direct_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      OrderedKernels.push_back(F);
-    }
-    OrderedKernels = sortByName(std::move(OrderedKernels));
-
-    llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
-    for (Function *F : OrderedKernels) {
-      for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-        if (!isNamedBarrier(*GV))
-          continue;
-
-        LDSUsesInfo.direct_access[F].erase(GV);
-        if (GV->isAbsoluteSymbolRef()) {
-          // already assigned
-          continue;
-        }
-        OrderedGVs.push_back(GV);
-      }
-      OrderedGVs = sortByName(std::move(OrderedGVs));
-      for (GlobalVariable *GV : OrderedGVs) {
-        // GV could also be used directly by other kernels. If so, we need to
-        // create a new GV used only by this kernel and its function.
-        auto NewGV = uniquifyGVPerKernel(M, GV, F);
-        Changed |= (NewGV != GV);
-        unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-        unsigned BarId = Kernel2BarId[F];
-        BarId += NumAbsolutes + 1;
-        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-        Kernel2BarId[F] += BarCnt;
-        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-        recordLDSAbsoluteAddress(&M, NewGV, Offset);
-      }
-      OrderedGVs.clear();
-    }
-    // Also erase those special LDS variables from indirect_access.
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      assert(isKernelLDS(K.first));
-      for (GlobalVariable *GV : K.second) {
-        if (isNamedBarrier(*GV))
-          K.second.erase(GV);
-      }
-    }
-    return Changed;
-  }
-
   bool runOnModule(Module &M) {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS {
       }
     }
 
-    if (LDSUsesInfo.HasSpecialGVs) {
-      // Special LDS variables need special address assignment
-      Changed |= lowerSpecialLDSVariables(
-          M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
-    }
-
     // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
