diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 67042b700c047..5af2a2755cec3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -298,6 +298,15 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
   bool GlobalOpt;
 };
 
+void initializeAMDGPULowerExecSyncLegacyPass(PassRegistry &);
+extern char &AMDGPULowerExecSyncLegacyPassID;
+ModulePass *createAMDGPULowerExecSyncLegacyPass();
+
+struct AMDGPULowerExecSyncPass : PassInfoMixin<AMDGPULowerExecSyncPass> {
+  AMDGPULowerExecSyncPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &);
 extern char &AMDGPUSwLowerLDSLegacyPassID;
 ModulePass *
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
new file mode 100644
index 0000000000000..89f6b38df9d56
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower LDS global variables with target extension type "amdgcn.named.barrier"
+// that require specialized address assignment. It assigns a unique
+// barrier identifier to each named-barrier LDS variable and encodes
+// this identifier within the !absolute_symbol metadata of that global.
+// This encoding ensures that subsequent LDS lowering passes can process these
+// barriers correctly without conflicts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include 
+
+#define DEBUG_TYPE "amdgpu-lower-exec-sync"
+
+using namespace llvm;
+using namespace AMDGPU;
+
+namespace {
+
+// If GV is also used directly by other kernels, create a new GV
+// used only by this kernel and its function.
+static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+                                           Function *KF) {
+  bool NeedsReplacement = false;
+  for (Use &U : GV->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (isKernelLDS(F) && F != KF) {
+        NeedsReplacement = true;
+        break;
+      }
+    }
+  }
+  if (!NeedsReplacement)
+    return GV;
+  // Create a new GV used only by this kernel and its function
+  GlobalVariable *NewGV = new GlobalVariable(
+      M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+      GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+      GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+  NewGV->copyAttributesFrom(GV);
+  for (Use &U : make_early_inc_range(GV->uses())) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      Function *F = I->getFunction();
+      if (!isKernelLDS(F) || F == KF) {
+        U.getUser()->replaceUsesOfWith(GV, NewGV);
+      }
+    }
+  }
+  return NewGV;
+}
+
+// Write the specified address into metadata where it can be retrieved by
+// the assembler. Format is a half open range, [Address, Address+1)
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                     uint32_t Address) {
+  LLVMContext &Ctx = M->getContext();
+  auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+  auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+  auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+  GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                  MDNode::get(Ctx, {MinC, MaxC}));
+}
+
+template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
+  sort(V, [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {std::move(V)};
+}
+
+// Main utility function for special LDS variables lowering.
+static bool lowerExecSyncGlobalVariables(
+    Module &M, LDSUsesInfoTy &LDSUsesInfo,
+    VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+  // The 1st round: give module-absolute assignments
+  int NumAbsolutes = 0;
+  SmallVector<GlobalVariable *> OrderedGVs;
+  for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+    GlobalVariable *GV = K.first;
+    if (!isNamedBarrier(*GV))
+      continue;
+    // give a module-absolute assignment if it is indirectly accessed by
+    // multiple kernels. This is not precise, but we don't want to duplicate
+    // a function when it is called by multiple kernels.
+    if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+      OrderedGVs.push_back(GV);
+    } else {
+      // leave it to the 2nd round, which will give a kernel-relative
+      // assignment if it is only indirectly accessed by one kernel
+      LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+    }
+    LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+  }
+  OrderedGVs = sortByName(std::move(OrderedGVs));
+  for (GlobalVariable *GV : OrderedGVs) {
+    unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+    unsigned BarId = NumAbsolutes + 1;
+    unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+    NumAbsolutes += BarCnt;
+
+    // 4 bits for alignment, 5 bits for the barrier num,
+    // 3 bits for the barrier scope
+    unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+    recordLDSAbsoluteAddress(&M, GV, Offset);
+  }
+  OrderedGVs.clear();
+
+  // The 2nd round: give a kernel-relative assignment to each GV that is
+  // either only indirectly accessed by a single kernel or only directly
+  // accessed by multiple kernels.
+  SmallVector<Function *> OrderedKernels;
+  for (auto &K : LDSUsesInfo.direct_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    OrderedKernels.push_back(F);
+  }
+  OrderedKernels = sortByName(std::move(OrderedKernels));
+
+  DenseMap<Function *, unsigned> Kernel2BarId;
+  for (Function *F : OrderedKernels) {
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+      if (!isNamedBarrier(*GV))
+        continue;
+
+      LDSUsesInfo.direct_access[F].erase(GV);
+      if (GV->isAbsoluteSymbolRef()) {
+        // already assigned
+        continue;
+      }
+      OrderedGVs.push_back(GV);
+    }
+    OrderedGVs = sortByName(std::move(OrderedGVs));
+    for (GlobalVariable *GV : OrderedGVs) {
+      // GV could also be used directly by other kernels. If so, we need to
+      // create a new GV used only by this kernel and its function.
+      auto NewGV = uniquifyGVPerKernel(M, GV, F);
+      Changed |= (NewGV != GV);
+      unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+      unsigned BarId = Kernel2BarId[F];
+      BarId += NumAbsolutes + 1;
+      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
+      Kernel2BarId[F] += BarCnt;
+      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+      recordLDSAbsoluteAddress(&M, NewGV, Offset);
+    }
+    OrderedGVs.clear();
+  }
+  // Also erase those special LDS variables from indirect_access.
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    assert(isKernelLDS(K.first));
+    for (GlobalVariable *GV : K.second) {
+      if (isNamedBarrier(*GV))
+        K.second.erase(GV);
+    }
+  }
+  return Changed;
+}
+
+static bool runLowerExecSyncGlobals(Module &M) {
+  CallGraph CG = CallGraph(M);
+  bool Changed = false;
+  Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+  // For each kernel, what variables does it access directly or through
+  // callees
+  LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+  // For each variable accessed through callees, which kernels access it
+  VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+  for (auto &K : LDSUsesInfo.indirect_access) {
+    Function *F = K.first;
+    assert(isKernelLDS(F));
+    for (GlobalVariable *GV : K.second) {
+      LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+    }
+  }
+
+  if (LDSUsesInfo.HasSpecialGVs) {
+    // Special LDS variables need special address assignment
+    Changed |= lowerExecSyncGlobalVariables(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+  }
+  return Changed;
+}
+
+class AMDGPULowerExecSyncLegacy : public ModulePass {
+public:
+  static char ID;
+  AMDGPULowerExecSyncLegacy() : ModulePass(ID) {}
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerExecSyncLegacy::ID = 0;
+char &llvm::AMDGPULowerExecSyncLegacyPassID = AMDGPULowerExecSyncLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+                      "AMDGPU lowering of execution synchronization", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerExecSyncLegacy, DEBUG_TYPE,
+                    "AMDGPU lowering of execution synchronization", false,
+                    false)
+
+bool AMDGPULowerExecSyncLegacy::runOnModule(Module &M) {
+  return runLowerExecSyncGlobals(M);
+}
+
+ModulePass *llvm::createAMDGPULowerExecSyncLegacyPass() {
+  return new AMDGPULowerExecSyncLegacy();
+}
+
+PreservedAnalyses AMDGPULowerExecSyncPass::run(Module &M,
+                                               ModuleAnalysisManager &AM) {
+  return runLowerExecSyncGlobals(M) ? PreservedAnalyses::none()
+                                    : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a4ef524c43466..3c0328e93ffbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -922,126 +922,6 @@ class AMDGPULowerModuleLDS {
     return KernelToCreatedDynamicLDS;
   }
 
-  static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
-                                             Function *KF) {
-    bool NeedsReplacement = false;
-    for (Use &U : GV->uses()) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (isKernelLDS(F) && F != KF) {
-          NeedsReplacement = true;
-          break;
-        }
-      }
-    }
-    if (!NeedsReplacement)
-      return GV;
-    // Create a new GV used only by this kernel and its function
-    GlobalVariable *NewGV = new GlobalVariable(
-        M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
-        GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
-        GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
-    NewGV->copyAttributesFrom(GV);
-    for (Use &U : make_early_inc_range(GV->uses())) {
-      if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        Function *F = I->getFunction();
-        if (!isKernelLDS(F) || F == KF) {
-          U.getUser()->replaceUsesOfWith(GV, NewGV);
-        }
-      }
-    }
-    return NewGV;
-  }
-
-  bool lowerSpecialLDSVariables(
-      Module &M, LDSUsesInfoTy &LDSUsesInfo,
-      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
-    bool Changed = false;
-    const DataLayout &DL = M.getDataLayout();
-    // The 1st round: give module-absolute assignments
-    int NumAbsolutes = 0;
-    std::vector<GlobalVariable *> OrderedGVs;
-    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-      GlobalVariable *GV = K.first;
-      if (!isNamedBarrier(*GV))
-        continue;
-      // give a module-absolute assignment if it is indirectly accessed by
-      // multiple kernels. This is not precise, but we don't want to duplicate
-      // a function when it is called by multiple kernels.
-      if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
-        OrderedGVs.push_back(GV);
-      } else {
-        // leave it to the 2nd round, which will give a kernel-relative
-        // assignment if it is only indirectly accessed by one kernel
-        LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
-      }
-      LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
-    }
-    OrderedGVs = sortByName(std::move(OrderedGVs));
-    for (GlobalVariable *GV : OrderedGVs) {
-      unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-      unsigned BarId = NumAbsolutes + 1;
-      unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-      NumAbsolutes += BarCnt;
-
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-      recordLDSAbsoluteAddress(&M, GV, Offset);
-    }
-    OrderedGVs.clear();
-
-    // The 2nd round: give a kernel-relative assignment for GV that
-    // either only indirectly accessed by single kernel or only directly
-    // accessed by multiple kernels.
-    std::vector<Function *> OrderedKernels;
-    for (auto &K : LDSUsesInfo.direct_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      OrderedKernels.push_back(F);
-    }
-    OrderedKernels = sortByName(std::move(OrderedKernels));
-
-    llvm::DenseMap<Function *, unsigned> Kernel2BarId;
-    for (Function *F : OrderedKernels) {
-      for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
-        if (!isNamedBarrier(*GV))
-          continue;
-
-        LDSUsesInfo.direct_access[F].erase(GV);
-        if (GV->isAbsoluteSymbolRef()) {
-          // already assigned
-          continue;
-        }
-        OrderedGVs.push_back(GV);
-      }
-      OrderedGVs = sortByName(std::move(OrderedGVs));
-      for (GlobalVariable *GV : OrderedGVs) {
-        // GV could also be used directly by other kernels. If so, we need to
-        // create a new GV used only by this kernel and its function.
-        auto NewGV = uniquifyGVPerKernel(M, GV, F);
-        Changed |= (NewGV != GV);
-        unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
-        unsigned BarId = Kernel2BarId[F];
-        BarId += NumAbsolutes + 1;
-        unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
-        Kernel2BarId[F] += BarCnt;
-        unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
-        recordLDSAbsoluteAddress(&M, NewGV, Offset);
-      }
-      OrderedGVs.clear();
-    }
-    // Also erase those special LDS variables from indirect_access.
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      assert(isKernelLDS(K.first));
-      for (GlobalVariable *GV : K.second) {
-        if (isNamedBarrier(*GV))
-          K.second.erase(GV);
-      }
-    }
-    return Changed;
-  }
-
   bool runOnModule(Module &M) {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -1064,12 +944,6 @@ class AMDGPULowerModuleLDS {
       }
     }
 
-    if (LDSUsesInfo.HasSpecialGVs) {
-      // Special LDS variables need special address assignment
-      Changed |= lowerSpecialLDSVariables(
-          M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
-    }
-
     // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index e17c2113ca398..f7dff4ba4c5e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -273,6 +273,8 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
   //      this is a re-run of the pass
   //      so we don't have anything to do.
   //    - No variables are absolute.
+  // Named barriers that are absolute symbols are removed
+  // from the maps.
   std::optional<bool> HasAbsoluteGVs;
   bool HasSpecialGVs = false;
   for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
@@ -284,6 +286,10 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) {
       if (IsDirectMapDynLDSGV)
         continue;
       if (isNamedBarrier(*GV)) {
+        if (IsAbsolute) {
+          DirectMapKernel[Fn].erase(GV);
+          IndirectMapKernel[Fn].erase(GV);
+        }
         HasSpecialGVs = true;
         continue;
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bf6f1a9dbf576..46d70c257b75e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-perf-hint",
 MODULE_PASS("amdgpu-preload-kernel-arguments",
             AMDGPUPreloadKernelArgumentsPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
+MODULE_PASS("amdgpu-lower-exec-sync", AMDGPULowerExecSyncPass())
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 #undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b87b54ffc4f12..5ff16e29bbbb1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -465,6 +465,11 @@ static cl::opt<bool> EnableScalarIRPasses(
     cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool> EnableLowerExecSync(
+    "amdgpu-enable-lower-exec-sync",
+    cl::desc("Enable lowering of execution synchronization."), cl::init(true),
+    cl::Hidden);
+
 static cl::opt<bool>
     EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
                      cl::desc("Enable lowering of lds to global memory pass "
@@ -567,6 +572,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILoadStoreOptimizerLegacyPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
+  initializeAMDGPULowerExecSyncLegacyPass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
   initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
@@ -962,6 +968,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
+        if (EnableLowerExecSync)
+          PM.addPass(AMDGPULowerExecSyncPass());
         if (EnableSwLowerLDS)
           PM.addPass(AMDGPUSwLowerLDSPass(*this));
         if (EnableLowerModuleLDS)
@@ -1333,6 +1341,10 @@ void AMDGPUPassConfig::addIRPasses() {
   // Make enqueued block runtime handles externally visible.
   addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());
 
+  // Lower special LDS accesses.
+  if (EnableLowerExecSync)
+    addPass(createAMDGPULowerExecSyncLegacyPass());
+
   // Lower LDS accesses to global memory pass if address sanitizer is enabled.
   if (EnableSwLowerLDS)
     addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));
@@ -2080,6 +2092,9 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
 
   addPass(AMDGPUExportKernelRuntimeHandlesPass());
 
+  if (EnableLowerExecSync)
+    addPass(AMDGPULowerExecSyncPass());
+
   if (EnableSwLowerLDS)
     addPass(AMDGPUSwLowerLDSPass(TM));
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..0240315eb7066 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
   AMDGPUPrepareAGPRAlloc.cpp
+  AMDGPULowerExecSync.cpp
   AMDGPUSwLowerLDS.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
new file mode 100644
index 0000000000000..bed8fa20a5044
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-module-lds -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to ensure that LDS variables such as named barriers are lowered
+; correctly when the amdgpu-lower-module-lds pass runs after the
+; amdgpu-lower-exec-sync pass in the pipeline.
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]]
+; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+;.
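+
+; The !absolute_symbol ranges checked above decode under the encoding in
+; AMDGPULowerExecSync.cpp (offset = 0x802000 | scope << 9 | id << 4, where the
+; workgroup scope encodes as 0 here):
+;   META0: 8396816 = 0x802010 -> barrier id 1 (@bar2 holds 2 barriers, ids 1-2)
+;   META2: 8396848 = 0x802030 -> barrier id 3 (@bar1/@bar1.kernel1 hold ids 3-6)
+;   META1: 8396912 = 0x802070 -> barrier id 7 (@bar3)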
+define void @func1() #0 {
+; CHECK-LABEL: define void @func1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  ret void
+}
+
+define void @func2() #0 {
+; CHECK-LABEL: define void @func2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  store i8 7, ptr addrspace(3) @lds1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    call void @func1()
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier()
+  call void @func1()
+  call void @func2()
+  store i8 9, ptr addrspace(3) @lds1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel2(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  call void @func2()
+  store i8 10, ptr addrspace(3) @lds1, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind }
+; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-lds-size"="1" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+;.
+; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
+; CHECK: [[META2]] = !{i32 8396848, i32 8396849}
+; CHECK: [[META3]] = !{i32 0, i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
new file mode 100644
index 0000000000000..05f2f07c84503
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false < %s 2>&1 | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+; Test to ensure that LDS variables such as named barriers are lowered
+; correctly in the asan scenario, where the amdgpu-sw-lower-lds pass runs
+; after the amdgpu-lower-exec-sync pass in the pipeline.
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
+;
+define void @bar() #0 {
+; CHECK-LABEL: define void @bar(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK:    store i8 7, ptr addrspace(1) {{.*}}, align 4
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  store i8 7, ptr addrspace(3) @lds1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @barkernel() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @barkernel(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
+; CHECK:    {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}})
+; CHECK:    call void @llvm.amdgcn.s.barrier()
+; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK:    call void @bar()
+; CHECK:    store i8 10, ptr addrspace(1) {{.*}}, align 4
+; CHECK:    call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}})
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  call void @bar()
+  store i8 10, ptr addrspace(3) @lds1, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind sanitize_address }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"nosanitize_address", i32 1}
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address }
+; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" }
+;.
+; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
+; CHECK: [[META1]] = !{i32 8396816, i32 8396817}
+;.
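+
+; The !absolute_symbol values above follow the same encoding
+; (0x802000 | scope << 9 | id << 4): @bar1 holds four barriers starting at
+; id 1 (8396816 = 0x802010), so @bar2 gets the next free id 5
+; (8396880 = 0x802050).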
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
new file mode 100644
index 0000000000000..bde6db6463cb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s
+; RUN: llc < %s -enable-new-pm -stop-after=amdgpu-lower-exec-sync -mtriple=amdgcn-amd-amdhsa | FileCheck %s
+
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+
+;.
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+;.
+define void @func1() {
+; CHECK-LABEL: define void @func1() {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  ret void
+}
+
+define void @func2() {
+; CHECK-LABEL: define void @func2() {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    call void @func1()
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier()
+  call void @func1()
+  call void @func2()
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+  call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+  call void @func2()
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind }
+;.
+; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
+; CHECK: [[META2]] = !{i32 8396848, i32 8396849}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 8e7389ace9c5c..69dfbc2a2ae51 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -9,11 +9,11 @@
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
 
-; GCN-O0: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O2: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O3: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require,require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
 define void @empty() {
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index ee6caab6f25cd..fe75b2b5bfcf5 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -44,6 +44,7 @@
 ; GCN-O0-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; GCN-O0-NEXT:    Function Alias Analysis Results
 ; GCN-O0-NEXT:    Externalize enqueued block runtime handles
+; GCN-O0-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O0-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:    FunctionPass Manager
@@ -197,6 +198,7 @@
 ; GCN-O1-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:    Function Alias Analysis Results
 ; GCN-O1-NEXT:    Externalize enqueued block runtime handles
+; GCN-O1-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-NEXT:    FunctionPass Manager
@@ -489,6 +491,7 @@
 ; GCN-O1-OPTS-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:    Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:    Externalize enqueued block runtime handles
+; GCN-O1-OPTS-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-OPTS-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-OPTS-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
@@ -810,6 +813,7 @@
 ; GCN-O2-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:    Function Alias Analysis Results
 ; GCN-O2-NEXT:    Externalize enqueued block runtime handles
+; GCN-O2-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O2-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O2-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O2-NEXT:    FunctionPass Manager
@@ -1135,6 +1139,7 @@
 ; GCN-O3-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:    Function Alias Analysis Results
 ; GCN-O3-NEXT:    Externalize enqueued block runtime handles
+; GCN-O3-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O3-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O3-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O3-NEXT:    FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 03a666fbe3aea..9f3dfb01282bc 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync,amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s
 
 %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
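
For reference, a minimal standalone sketch of the barrier-offset encoding that both the removed lowerSpecialLDSVariables and the new lowerExecSyncGlobalVariables use; encodeNamedBarrierOffset and the main() self-check are illustrative helpers written for this note, not part of the patch:

// Sketch only: mirrors `0x802000u | BarrierScope << 9 | BarId << 4` from the
// diff above. Bits [3:0] carry alignment, [8:4] the barrier id, and [11:9]
// the barrier scope; 0x802000 supplies the fixed marker bits.
#include <cassert>
#include <cstdint>

static uint32_t encodeNamedBarrierOffset(uint32_t BarId, uint32_t Scope) {
  assert(BarId < 32 && Scope < 8 && "5-bit id, 3-bit scope");
  return 0x802000u | (Scope << 9) | (BarId << 4);
}

int main() {
  // Matches META0 in amdgpu-lower-exec-sync.ll: !{i32 8396816, i32 8396817},
  // i.e. barrier id 1 with the workgroup scope encoding as 0.
  assert(encodeNamedBarrierOffset(1, 0) == 8396816u);
  return 0;
}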