From 0e647db27581991071d6e69abe977206806c5548 Mon Sep 17 00:00:00 2001 From: alex-t Date: Wed, 23 Apr 2025 22:13:15 +0200 Subject: [PATCH 1/4] [AMDGPU] Automatic conversion from wave32 to wave64 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 + llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp | 321 ++++++++++++++++++ llvm/lib/Target/AMDGPU/SIConvertWaveSize.h | 30 ++ .../AMDGPU/wave32-to-64-auto-convert.ll | 121 +++++++ 7 files changed, 480 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp create mode 100644 llvm/lib/Target/AMDGPU/SIConvertWaveSize.h create mode 100644 llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4ff761ec19b3c..76ef87ba44913 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -51,6 +51,7 @@ FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsLegacyPass(); FunctionPass *createSIFormMemoryClausesLegacyPass(); +FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *); FunctionPass *createSIPostRABundlerPass(); FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *); @@ -174,6 +175,9 @@ extern char &SIShrinkInstructionsLegacyID; void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixSGPRCopiesLegacyID; +void initializeSIConvertWaveSizeLegacyPass(PassRegistry &); +extern char &SIConvertWaveSizeLegacyID; + void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixVGPRCopiesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 98a1147ef6d66..0cbd3ef8da761 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast(this))) +FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast(this))) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b6cc5137d711a..5be1640fd3db6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -44,6 +44,7 @@ #include "R600TargetMachine.h" #include "SIFixSGPRCopies.h" #include "SIFixVGPRCopies.h" +#include "SIConvertWaveSize.h" #include "SIFoldOperands.h" #include "SIFormMemoryClauses.h" #include "SILoadStoreOptimizer.h" @@ -506,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILowerSGPRSpillsLegacyPass(*PR); initializeSIFixSGPRCopiesLegacyPass(*PR); initializeSIFixVGPRCopiesLegacyPass(*PR); + initializeSIConvertWaveSizeLegacyPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..663361face090 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -150,6 +150,7 @@ add_llvm_target(AMDGPUCodeGen SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp SIFixVGPRCopies.cpp + SIConvertWaveSize.cpp SIFoldOperands.cpp SIFormMemoryClauses.cpp SIFrameLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp new file mode 100644 index 0000000000000..4f5b839000c77 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp @@ -0,0 +1,321 @@ +//===- 
SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64 +//---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +// Small short living kernels may become waveslot limited. +// To work around the problem an optimization is proposed to convert such +// kernels from wave32 to wave64 automatically. These kernels shall conform to a +// strict set of limitations and satisfy profitability conditions. +// +// 1. A kernel shall have no function calls as we cannot analyze call stack +// requirements (nor will it fall into a category of short living kernels +// anyway). +// 2. A kernel itself shall not be called from a device enqueue call. +// 3. A kernel shall not attempt to access EXEC or VCC in any user visible +// way. +// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP +// operations in general. +// 5. A kernel shall not read wavefront size or use ballot through +// intrinsics (a use of pre-defined frontend wave size macro was deemed +// permissible for now). +// 6. There shall be no atomic operations of any sort as these may be used +// for cross-thread communication. +// 7. There shall be no LDS access as the allocation is usually tied to the +// workgroup size and we generally cannot extend it. It is also changing +// occupancy which is tied to the wave size. +// 8. There shall be no inline asm calls. +// 9. There shall be no dynamic VGPRs. +// 10. Starting from GFX11 some instructions (such as WMMA on GFX11+ and +// transpose loads on GFX12+) work differently (have different operands) in +// wave32 and wave64. The kernel shall not have intrinsics to invoke such +// instructions. 
+ +#include "SIConvertWaveSize.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-convert-wave-size" + +namespace { +class SIConvertWaveSize { + const TargetMachine *TM; + const LoopInfo *LI; + ScalarEvolution *SE; + TargetTransformInfo *TTI; + + InstructionCost TotalCost = 0; + + static const unsigned MaxLatency = 2000; + + SmallVector Callees; + +public: + SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI, + ScalarEvolution *SE, TargetTransformInfo *TTI) + : TM(TM), LI(LI), SE(SE), TTI(TTI) {} + + bool run(Function &F); + + bool changeWaveSizeAttr(Function *F); +}; + +class SIConvertWaveSizeLegacy : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; + SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {} + bool runOnFunction(Function &F) override { + auto &LI = getAnalysis().getLoopInfo(); + auto &SE = getAnalysis().getSE(); + auto &TTI = getAnalysis().getTTI(F); + SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); + return Impl.run(F); + } + StringRef getPassName() const override { return "SI convert wave size"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} // end anonymous namespace + +void printFunctionAttributes(const Function &F) { + LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n"); + for (const auto &Attr : F.getAttributes()) { + LLVM_DEBUG(dbgs() << " Attribute: " << Attr.getAsString() << "\n"); + } +} + +bool SIConvertWaveSize::run(Function &F) { + LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n"); + LLVM_DEBUG(printFunctionAttributes(F)); + + const GCNSubtarget &ST = TM->getSubtarget(F); + if 
(ST.getGeneration() < AMDGPUSubtarget::GFX11) + return false; + + // Check if the function is a kernel. + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + return false; + + // Check if the kernel is wave32 + if (F.hasFnAttribute("target-features")) { + if (!F.getFnAttribute("target-features") + .getValueAsString().contains("wavefrontsize32")) { + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n"); + return false; + } + } + + // Check if the function is a device enqueue call. + if (F.hasFnAttribute("amdgpu-device-enqueue")) { + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n"); + return false; + } + + // Check if a trip count is a compile time constant for all loops in the + // kernel + for (Loop *L : *LI) { + const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L); + if (!isa(TripCountSCEV)) { + LLVM_DEBUG( + dbgs() << "SIConvertWaveSize: Trip count is not a compile time " + "constant.\n"); + return false; + } + } + + for (const auto &BB : F) { + InstructionCost BlockCost = 0; + for (const auto &I : BB) { + if (const CallBase *CB = dyn_cast(&I)) { + // FIXME: Any calls are not allowed. Only non-converged intrinsic clls + // and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd + // separately for debug purposes. This will be changed in the final + // version. + if (CB->isInlineAsm()) { + // Inline assembly is not allowed. + LLVM_DEBUG(dbgs() + << "SIConvertWaveSize: Inline assembly detected.\n"); + return false; + } + if (CB->isAtomic()) { + // Atomic operations are not allowed. 
+ LLVM_DEBUG(dbgs() + << "SIConvertWaveSize: Atomic operation detected.\n"); + return false; + } + if (Function *Callee = CB->getCalledFunction()) { + // assuming readlane/readfirstlane or any cross-lane/DPP + // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td + if (Callee->isIntrinsic()) { + if (Callee->hasFnAttribute(Attribute::Convergent)) { + if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) { + // TODO: what else should go in a "white list" ? + // Intrinsic::amdgcn_s_barrier_wavefront ? + // Intrinsic::amdgcn_s_barrier_signal ? + LLVM_DEBUG(dbgs() + << "SIConvertWaveSize: Convergent intrinsic " + << Callee->getName() << " detected.\n"); + return false; + } + } + + if (Callee->getIntrinsicID() == Intrinsic::read_register) { + if (const auto *MDVal = + dyn_cast(CB->getArgOperand(0))) { + Metadata *MD = MDVal->getMetadata(); + if (auto *MDNodeVal = dyn_cast(MD)) { + if (MDNodeVal->getNumOperands() >= 1) { + if (auto *MDStr = + dyn_cast(MDNodeVal->getOperand(0))) { + if (MDStr->getString().starts_with("exec") || + MDStr->getString().starts_with("vcc")) { + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register(" + << MDStr->getString() + << ") intrinsic detected.\n"); + return false; + } + } + } + } + } + } + + // Save callee as a candidate for attribute change + Callees.push_back(Callee); + } + } else { + // General calls are not allowed. + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n"); + return false; + } + } + // No LDS access is allowed + if (auto LI = dyn_cast(&I)) { + if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n"); + return false; + } + } + if (auto SI = dyn_cast(&I)) { + if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n"); + return false; + } + } + // TODO: All atomics are not allowed? 
+ // if (auto AI = dyn_cast(&I)) { + // if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access + // detected.\n"); return false; + // } + // } + + // TODO: Dynamic VGPRS and GFX11+ special operations ??? + BlockCost += + TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput); + } + if (auto L = LI->getLoopFor(&BB)) { + const SCEV *TripCount = SE->getBackedgeTakenCount(L); + if (auto *C = dyn_cast(TripCount)) { + uint64_t TC = C->getValue()->getZExtValue() + 1; + size_t Depth = LI->getLoopDepth(&BB); + BlockCost *= TC * Depth; + } else + llvm_unreachable("SIConvertWaveSize: only loops with compile time " + "constant trip count could reach here!\n"); + } + TotalCost += BlockCost; + if (TotalCost.isValid()) { + if (TotalCost.getValue().value() >= MaxLatency) { + LLVM_DEBUG( + dbgs() << "SIConvertWaveSize: Total latency of the kernel [" + << TotalCost.getValue().value() + << "] exceeds the limit of 2000 cycles - not profitable!\n"); + return false; + } + } else + llvm_unreachable( + "SIConvertWaveSize: Cost model error - invalid state!\n"); + } + + // Additional checks can be added here... + + // If all checks pass, convert wave size from wave32 to wave64. + // Conversion logic goes here... + bool Changed = changeWaveSizeAttr(&F); + if (Changed) + // Now take care of the intrinsic calls + for (auto C : Callees) { + // TODO: if we could not change Attr for one of the callee + // we need to rollback all the changes! + changeWaveSizeAttr(C); + } + + return Changed; + } + +bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) { + auto Attr = F->getFnAttribute("target-features"); + if (Attr.isValid()) { + StringRef AttrStr = Attr.getValueAsString(); + size_t Pos = AttrStr.find("+wavefrontsize32"); + if (Pos != StringRef::npos) { + // Remove the "+wavefrontsize32" attribute. 
+ std::string NewBegin = AttrStr.substr(0, Pos).str().append("+wavefrontsize64"); + std::string End = AttrStr.substr(Pos + strlen("+wavefrontsize32")).str(); + std::string NewAttrStr = NewBegin + End; + // Add the "+wavefrontsize64" attribute. + F->removeFnAttr("target-features"); + F->addFnAttr("target-features", NewAttrStr); + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for " + << F->getName() + << " from wave32 " + "to wave64.\n"); + return true; + } + } + return false; +} + +INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", + false, false) + +char SIConvertWaveSizeLegacy::ID = 0; + +char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID; + +FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) { + return new SIConvertWaveSizeLegacy(TM); +} + +PreservedAnalyses SIConvertWaveSizePass::run( + Function &F, FunctionAnalysisManager &FAM) { + auto &LI = FAM.getResult(F); + auto &SE = FAM.getResult(F); + auto &TTI = FAM.getResult(F); + + SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); + bool Changed = Impl.run(F); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h new file mode 100644 index 0000000000000..78b8365ed9ebc --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h @@ -0,0 +1,30 @@ +//===- SIConvertWaveSize.h ----------------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H +#define LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H + +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class SIConvertWaveSizePass : public PassInfoMixin { + /// The target machine. + const TargetMachine *TM; + +public: + SIConvertWaveSizePass(const TargetMachine &TM) + : TM(&TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll new file mode 100644 index 0000000000000..d90e524e9cc2e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size < %s | FileCheck %s + +define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 { + ; CHECK: @test_not_wave32{{.*}}) #0 + %gep = getelementptr i32, ptr addrspace(1) %out, i32 2 + %tmp = load i32, ptr addrspace(1) %gep + store i32 %tmp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @intr_non_convergent(ptr addrspace(1) nocapture %arg) #1 { + ; CHECK: @intr_non_convergent{{.*}} #0 +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @intr_convergent(ptr addrspace(1) nocapture %arg, i32 %X) #1 { + ; CHECK: @intr_convergent{{.*}}) #1 +bb: + %tmp = icmp ugt i32 %X, 32 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %tmp) + store i32 %ballot, ptr addrspace(1) %arg + ret void +} + 
+define amdgpu_kernel void @test_barrier(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 { + ; CHECK: @test_barrier{{.*}}) #0 +entry: + %val = load <2 x half>, ptr addrspace(1) %in + call void @llvm.amdgcn.s.barrier() #2 + store <2 x half> %val, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @test_read_exec(ptr addrspace(1) %out) #1 { + ; CHECK: @test_read_exec{{.*}}) #1 + %exec = call i64 @llvm.read_register.i64(metadata !0) + store i64 %exec, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_read_vcc_lo(ptr addrspace(1) %out) #1 { + ; CHECK: @test_read_vcc_lo{{.*}}) #1 + %vcc_lo = call i32 @llvm.read_register.i32(metadata !1) + store i32 %vcc_lo, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_read_vcc_hi(ptr addrspace(1) %out) #1 { + ; CHECK: @test_read_vcc_hi{{.*}}) #1 + %vcc_hi = call i32 @llvm.read_register.i32(metadata !2) + store i32 %vcc_hi, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 { + ; CHECK: @test_lds_access{{.*}}) #1 + %gep = getelementptr i32, ptr addrspace(3) %out, i32 2 + %tmp = load i32, ptr addrspace(3) %gep + store i32 %tmp, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 { + ; CHECK: @test_simple_loop{{.*}}) #1 +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +define amdgpu_kernel void @test_nested_loop(ptr addrspace(1) nocapture %arg) #1 { + ; CHECK: @test_nested_loop{{.*}}) #1 +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 8 + br label %bb3 + +bb3: + %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ] + %tmp5 = add nuw nsw i32 %tmp4, 1 + %tmp6 = 
icmp eq i32 %tmp5, 128 + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp6, label %bb4, label %bb3 + +bb4: + br i1 %tmp3, label %bb1, label %bb2 +} + +declare void @llvm.amdgcn.s.sleep(i32) +declare i32 @llvm.amdgcn.wavefrontsize() +declare i32 @llvm.amdgcn.ballot.i32(i1) +declare i32 @llvm.read_register.i32(metadata) +declare i64 @llvm.read_register.i64(metadata) + +attributes #0 = { nounwind "target-features"="+wavefrontsize64" } +attributes #1 = { nounwind "target-features"="+wavefrontsize32" } + +!0 = !{!"exec"} +!1 = !{!"vcc_lo"} +!2 = !{!"vcc_hi"} From 4bc81331860618d77926a599a5d973eda2ece1a9 Mon Sep 17 00:00:00 2001 From: alex-t Date: Wed, 7 May 2025 21:01:21 +0200 Subject: [PATCH 2/4] [AMDGPU] Automatic conversion from wave32 to wave64. Review issues addressed. --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +- ...WaveSize.cpp => AMDGPUConvertWaveSize.cpp} | 256 ++++++++++++------ ...vertWaveSize.h => AMDGPUConvertWaveSize.h} | 13 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +- 6 files changed, 183 insertions(+), 98 deletions(-) rename llvm/lib/Target/AMDGPU/{SIConvertWaveSize.cpp => AMDGPUConvertWaveSize.cpp} (50%) rename llvm/lib/Target/AMDGPU/{SIConvertWaveSize.h => AMDGPUConvertWaveSize.h} (65%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 76ef87ba44913..7e585d2698564 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -51,7 +51,9 @@ FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsLegacyPass(); FunctionPass *createSIFormMemoryClausesLegacyPass(); -FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *); +FunctionPass *createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *); +void initializeAMDGPUConvertWaveSizeLegacyPass(PassRegistry &); +extern char 
&AMDGPUConvertWaveSizeLegacyID; FunctionPass *createSIPostRABundlerPass(); FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *); diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp similarity index 50% rename from llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp rename to llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp index 4f5b839000c77..c166def577558 100644 --- a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp @@ -1,11 +1,12 @@ -//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64 -//---------===// +//===- SIConvertWaveSize.cpp ----------------------------------------------===// +// +// Automatically converts wave32 kernels to wave64 // // Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -//===----------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// // /// \file // Small short living kernels may become waveslot limited. @@ -36,7 +37,7 @@ // wave32 and wave64. The kernel shall not have intrinsics to invoke such // instructions. 
-#include "SIConvertWaveSize.h" +#include "AMDGPUConvertWaveSize.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -49,8 +50,8 @@ using namespace llvm; #define DEBUG_TYPE "si-convert-wave-size" namespace { -class SIConvertWaveSize { - const TargetMachine *TM; +class AMDGPUConvertWaveSize { + const GCNTargetMachine *TM; const LoopInfo *LI; ScalarEvolution *SE; TargetTransformInfo *TTI; @@ -62,8 +63,8 @@ class SIConvertWaveSize { SmallVector Callees; public: - SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI, - ScalarEvolution *SE, TargetTransformInfo *TTI) + AMDGPUConvertWaveSize(const GCNTargetMachine *TM, const LoopInfo *LI, + ScalarEvolution *SE, TargetTransformInfo *TTI) : TM(TM), LI(LI), SE(SE), TTI(TTI) {} bool run(Function &F); @@ -71,20 +72,20 @@ class SIConvertWaveSize { bool changeWaveSizeAttr(Function *F); }; -class SIConvertWaveSizeLegacy : public FunctionPass { - const TargetMachine *TM; +class AMDGPUConvertWaveSizeLegacy : public FunctionPass { + const GCNTargetMachine *TM; public: static char ID; - SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {} + AMDGPUConvertWaveSizeLegacy(const GCNTargetMachine *TM) : FunctionPass(ID), TM(TM) {} bool runOnFunction(Function &F) override { auto &LI = getAnalysis().getLoopInfo(); auto &SE = getAnalysis().getSE(); auto &TTI = getAnalysis().getTTI(F); - SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); + AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI); return Impl.run(F); } - StringRef getPassName() const override { return "SI convert wave size"; } + StringRef getPassName() const override { return "AMDGPU convert wave size"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); @@ -101,30 +102,44 @@ void printFunctionAttributes(const Function &F) { } } -bool SIConvertWaveSize::run(Function &F) { - LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n"); - 
LLVM_DEBUG(printFunctionAttributes(F)); - - const GCNSubtarget &ST = TM->getSubtarget(F); - if (ST.getGeneration() < AMDGPUSubtarget::GFX11) - return false; +bool AMDGPUConvertWaveSize::run(Function &F) { // Check if the function is a kernel. if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) return false; - // Check if the kernel is wave32 - if (F.hasFnAttribute("target-features")) { - if (!F.getFnAttribute("target-features") - .getValueAsString().contains("wavefrontsize32")) { - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n"); + const GCNSubtarget &ST = TM->getSubtarget(F); + if (!ST.isWave32()) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel is not wave32.\n"); + return false; + } + + for (const auto &Arg : F.args()) { + if (Arg.getType()->isPointerTy() && + Arg.getType()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel argument " << Arg + << " points to LDS object\n"); return false; } } - // Check if the function is a device enqueue call. - if (F.hasFnAttribute("amdgpu-device-enqueue")) { - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n"); + // Check if the function can be called via device enqueue. 
+ bool addressEscapes = false; + if (!F.use_empty()) { + const Module *M = F.getParent(); + for (const GlobalVariable &GV : M->globals()) { + if (GV.hasInitializer()) { + if (const Constant *Init = GV.getInitializer()) { + if (isa(Init) && Init == &F) { + addressEscapes = true; + } + } + } + } + } + + if (addressEscapes) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel address is taken.\n"); return false; } @@ -134,7 +149,7 @@ bool SIConvertWaveSize::run(Function &F) { const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L); if (!isa(TripCountSCEV)) { LLVM_DEBUG( - dbgs() << "SIConvertWaveSize: Trip count is not a compile time " + dbgs() << "AMDGPUConvertWaveSize: Trip count is not a compile time " "constant.\n"); return false; } @@ -143,23 +158,25 @@ bool SIConvertWaveSize::run(Function &F) { for (const auto &BB : F) { InstructionCost BlockCost = 0; for (const auto &I : BB) { + + // Atomic operations are not allowed. + if (I.isAtomic()) { + LLVM_DEBUG( + dbgs() << "AMDGPUConvertWaveSize: Atomic operation detected.\n"); + return false; + } + if (const CallBase *CB = dyn_cast(&I)) { - // FIXME: Any calls are not allowed. Only non-converged intrinsic clls - // and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd - // separately for debug purposes. This will be changed in the final - // version. + // FIXME: Any calls are not allowed. Only non-converged intrinsic calls + // and amdgsn_s_barrier are exempt. InlineAsm is checked separately + // for debug purposes. This will be changed in the final version. if (CB->isInlineAsm()) { // Inline assembly is not allowed. LLVM_DEBUG(dbgs() - << "SIConvertWaveSize: Inline assembly detected.\n"); - return false; - } - if (CB->isAtomic()) { - // Atomic operations are not allowed. 
- LLVM_DEBUG(dbgs() - << "SIConvertWaveSize: Atomic operation detected.\n"); + << "AMDGPUConvertWaveSize: Inline assembly detected.\n"); return false; } + if (Function *Callee = CB->getCalledFunction()) { // assuming readlane/readfirstlane or any cross-lane/DPP // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td @@ -170,66 +187,131 @@ bool SIConvertWaveSize::run(Function &F) { // Intrinsic::amdgcn_s_barrier_wavefront ? // Intrinsic::amdgcn_s_barrier_signal ? LLVM_DEBUG(dbgs() - << "SIConvertWaveSize: Convergent intrinsic " + << "AMDGPUConvertWaveSize: Convergent intrinsic " << Callee->getName() << " detected.\n"); return false; } } - if (Callee->getIntrinsicID() == Intrinsic::read_register) { - if (const auto *MDVal = - dyn_cast(CB->getArgOperand(0))) { - Metadata *MD = MDVal->getMetadata(); - if (auto *MDNodeVal = dyn_cast(MD)) { - if (MDNodeVal->getNumOperands() >= 1) { - if (auto *MDStr = - dyn_cast(MDNodeVal->getOperand(0))) { - if (MDStr->getString().starts_with("exec") || - MDStr->getString().starts_with("vcc")) { - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register(" - << MDStr->getString() - << ") intrinsic detected.\n"); - return false; - } - } - } + if (Callee->getIntrinsicID() == Intrinsic::read_register || + Callee->getIntrinsicID() == Intrinsic::write_register) { + + LLVM_DEBUG(dbgs() + << "AMDGPUConvertWaveSize: read/write_register " + "intrinsic detected.\n"); + return false; + } + + // Take care of LDS access + if (const auto *MTI = dyn_cast(&I)) { + auto DstAS = MTI->getDestAddressSpace(); + auto SrcAS = MTI->getSourceAddressSpace(); + if (DstAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access " + "(llvm.memcpy/memmove) detected.\n"); + return false; + } + } else if (const auto *MSI = dyn_cast(&I)) { + auto DstAS = MSI->getDestAddressSpace(); + if (DstAS == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access " + 
"(llvm.memset) detected.\n"); + return false; + } + } else if (const auto AMCI = dyn_cast(&I)) { + auto DstAS = AMCI->getDestAddressSpace(); + auto SrcAS = AMCI->getSourceAddressSpace(); + if (DstAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG( + dbgs() + << "AMDGPUConvertWaveSize: LDS access " + "(llvm.memcpy.element.unordered.atomic) detected\n"); + return false; + } + } else + if (const auto AMMI = dyn_cast(&I)) { + auto DstAS = AMMI->getDestAddressSpace(); + auto SrcAS = AMMI->getSourceAddressSpace(); + if (DstAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG( + dbgs() + << "AMDGPUConvertWaveSize: LDS access " + "(llvm.memmove.element.unordered.atomic) detected.\n"); + return false; + } + } else if (const auto *AMSI = dyn_cast(&I)) { + auto DstAS = AMSI->getDestAddressSpace(); + if (DstAS == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG( + dbgs() + << "AMDGPUConvertWaveSize: LDS access " + "(llvm.memset.element.unordered.atomic) detected.\n"); + return false; } } - } // Save callee as a candidate for attribute change Callees.push_back(Callee); } } else { // General calls are not allowed. - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: function call detected.\n"); return false; } } // No LDS access is allowed - if (auto LI = dyn_cast(&I)) { + + // We already ensured we have no LDS pointers passed as arguments. + // Now take care of those cast from flat or global + + // Bail out early, before we come across the LDS addres use. 
+ if (const auto AC = dyn_cast(&I)) { + if (AC->getDestTy()->getPointerAddressSpace() == + AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); + return false; + } + } + + if (const auto I2P = dyn_cast(&I)) { + if (I2P->getDestTy()->isPointerTy() && + I2P->getDestTy()->getPointerAddressSpace() == + AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); + return false; + } + } + + // GEP may refer to the global LDS object + if (const auto GEP = dyn_cast(&I)) { + if (GEP->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); + return false; + } + } + + // Load/Store/Atomics may directly use global LDS object + if (const auto LI = dyn_cast(&I)) { if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); return false; } } - if (auto SI = dyn_cast(&I)) { + if (const auto SI = dyn_cast(&I)) { if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); return false; } } - // TODO: All atomics are not allowed? - // if (auto AI = dyn_cast(&I)) { - // if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access - // detected.\n"); return false; - // } - // } + + if (const auto MemIntr = dyn_cast(&I)) // TODO: Dynamic VGPRS and GFX11+ special operations ??? 
BlockCost += - TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput); + TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency); } if (auto L = LI->getLoopFor(&BB)) { const SCEV *TripCount = SE->getBackedgeTakenCount(L); @@ -238,21 +320,21 @@ bool SIConvertWaveSize::run(Function &F) { size_t Depth = LI->getLoopDepth(&BB); BlockCost *= TC * Depth; } else - llvm_unreachable("SIConvertWaveSize: only loops with compile time " + llvm_unreachable("AMDGPUConvertWaveSize: only loops with compile time " "constant trip count could reach here!\n"); } TotalCost += BlockCost; if (TotalCost.isValid()) { if (TotalCost.getValue().value() >= MaxLatency) { LLVM_DEBUG( - dbgs() << "SIConvertWaveSize: Total latency of the kernel [" + dbgs() << "AMDGPUConvertWaveSize: Total latency of the kernel [" << TotalCost.getValue().value() << "] exceeds the limit of 2000 cycles - not profitable!\n"); return false; } } else llvm_unreachable( - "SIConvertWaveSize: Cost model error - invalid state!\n"); + "AMDGPUConvertWaveSize: Cost model error - invalid state!\n"); } // Additional checks can be added here... @@ -269,9 +351,9 @@ bool SIConvertWaveSize::run(Function &F) { } return Changed; - } +} -bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) { +bool AMDGPUConvertWaveSize::changeWaveSizeAttr(Function *F) { auto Attr = F->getFnAttribute("target-features"); if (Attr.isValid()) { StringRef AttrStr = Attr.getValueAsString(); @@ -284,7 +366,7 @@ bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) { // Add the "+wavefrontsize64" attribute. 
F->removeFnAttr("target-features"); F->addFnAttr("target-features", NewAttrStr); - LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for " + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for " << F->getName() << " from wave32 " "to wave64.\n"); @@ -294,28 +376,28 @@ bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) { return false; } -INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", +INITIALIZE_PASS_BEGIN(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", +INITIALIZE_PASS_END(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size", false, false) -char SIConvertWaveSizeLegacy::ID = 0; +char AMDGPUConvertWaveSizeLegacy::ID = 0; -char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID; +char &llvm::AMDGPUConvertWaveSizeLegacyID = AMDGPUConvertWaveSizeLegacy::ID; -FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) { - return new SIConvertWaveSizeLegacy(TM); +FunctionPass *llvm::createAMDGPUConvertWaveSizeLegacyPass(const GCNTargetMachine *TM) { + return new AMDGPUConvertWaveSizeLegacy(TM); } -PreservedAnalyses SIConvertWaveSizePass::run( +PreservedAnalyses AMDGPUConvertWaveSizePass::run( Function &F, FunctionAnalysisManager &FAM) { - auto &LI = FAM.getResult(F); + auto &LI = FAM.getResult(F); auto &SE = FAM.getResult(F); auto &TTI = FAM.getResult(F); - SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); + AMDGPUConvertWaveSize Impl(TM, &LI, &SE, &TTI); bool Changed = Impl.run(F); return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h similarity index 65% rename from llvm/lib/Target/AMDGPU/SIConvertWaveSize.h rename to llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h index 78b8365ed9ebc..e5b8c92c0b656 100644 --- a/llvm/lib/Target/AMDGPU/SIConvertWaveSize.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.h @@ -5,9 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H -#define LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -15,16 +16,16 @@ namespace llvm { -class SIConvertWaveSizePass : public PassInfoMixin { +class AMDGPUConvertWaveSizePass : public PassInfoMixin { /// The target machine. 
- const TargetMachine *TM; + const GCNTargetMachine *TM; public: - SIConvertWaveSizePass(const TargetMachine &TM) + AMDGPUConvertWaveSizePass(const GCNTargetMachine &TM) : TM(&TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); }; } // namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_SICONVERTWAVESIZE_H +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCONVERTWAVESIZE_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 0cbd3ef8da761..b953ba8e77599 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -67,7 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast(this))) -FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast(this))) +FUNCTION_PASS("amdgpu-convert-wave-size", AMDGPUConvertWaveSizePass(*static_cast(this))) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5be1640fd3db6..f2e7adebf2786 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -44,7 +44,7 @@ #include "R600TargetMachine.h" #include "SIFixSGPRCopies.h" #include "SIFixVGPRCopies.h" -#include "SIConvertWaveSize.h" +#include "AMDGPUConvertWaveSize.h" #include "SIFoldOperands.h" #include "SIFormMemoryClauses.h" #include "SILoadStoreOptimizer.h" @@ -507,7 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILowerSGPRSpillsLegacyPass(*PR); initializeSIFixSGPRCopiesLegacyPass(*PR); initializeSIFixVGPRCopiesLegacyPass(*PR); - initializeSIConvertWaveSizeLegacyPass(*PR); + initializeAMDGPUConvertWaveSizeLegacyPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); 
initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 663361face090..fccdd47151593 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -150,7 +150,7 @@ add_llvm_target(AMDGPUCodeGen SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp SIFixVGPRCopies.cpp - SIConvertWaveSize.cpp + AMDGPUConvertWaveSize.cpp SIFoldOperands.cpp SIFormMemoryClauses.cpp SIFrameLowering.cpp From beebaa2c99f088f50b7c5129997e46daf028ae3c Mon Sep 17 00:00:00 2001 From: alex-t Date: Wed, 7 May 2025 21:26:29 +0200 Subject: [PATCH 3/4] [AMDGPU] Automatic conversion from wave32 to wave64. Initialize method name fixed --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 7e585d2698564..06aa8c4ad06b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -177,8 +177,8 @@ extern char &SIShrinkInstructionsLegacyID; void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixSGPRCopiesLegacyID; -void initializeSIConvertWaveSizeLegacyPass(PassRegistry &); -extern char &SIConvertWaveSizeLegacyID; +void initializeAMDGPUConvertWaveSizeLegacyPass(PassRegistry &); +extern char &AMDGPUConvertWaveSizeLegacyID; void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixVGPRCopiesID; From 04a4147bbb97166c805f48c5bf2faac7648def3f Mon Sep 17 00:00:00 2001 From: alex-t Date: Thu, 8 May 2025 22:05:39 +0200 Subject: [PATCH 4/4] [AMDGPU] Automatic conversion from wave32 to wave64. -- Simplified wavefrontsize attribute update -- LDS access checks changed -- new LDS test in LIT NOTE: because of the simplified attribute update we need to change all the checks. Most likely the direct checks of attributes are now impossible.
--- .../Target/AMDGPU/AMDGPUConvertWaveSize.cpp | 159 ++++-------------- .../AMDGPU/wave32-to-64-auto-convert.ll | 29 +++- 2 files changed, 62 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp index c166def577558..1ee86c437610d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUConvertWaveSize.cpp @@ -47,7 +47,7 @@ using namespace llvm; -#define DEBUG_TYPE "si-convert-wave-size" +#define DEBUG_TYPE "amdgpu-convert-wave-size" namespace { class AMDGPUConvertWaveSize { @@ -68,8 +68,6 @@ class AMDGPUConvertWaveSize { : TM(TM), LI(LI), SE(SE), TTI(TTI) {} bool run(Function &F); - - bool changeWaveSizeAttr(Function *F); }; class AMDGPUConvertWaveSizeLegacy : public FunctionPass { @@ -123,22 +121,25 @@ bool AMDGPUConvertWaveSize::run(Function &F) { } } - // Check if the function can be called via device enqueue. - bool addressEscapes = false; - if (!F.use_empty()) { - const Module *M = F.getParent(); - for (const GlobalVariable &GV : M->globals()) { - if (GV.hasInitializer()) { - if (const Constant *Init = GV.getInitializer()) { - if (isa(Init) && Init == &F) { - addressEscapes = true; - } + // Check for static LDS uses + const Module *M = F.getParent(); + for (const GlobalVariable &GV : M->globals()) { + if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + + for (auto User : GV.users()) { + if (auto UseI = dyn_cast(User)) { + if (UseI->getFunction() == &F) { + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Global variable " << GV + << " points to LDS object and is used\n"); + return false; } } } } - if (addressEscapes) { + // Check if the kernel can be called via device enqueue. 
+ if (F.hasAddressTaken()) { LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Kernel address is taken.\n"); return false; } @@ -184,8 +185,6 @@ bool AMDGPUConvertWaveSize::run(Function &F) { if (Callee->hasFnAttribute(Attribute::Convergent)) { if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) { // TODO: what else should go in a "white list" ? - // Intrinsic::amdgcn_s_barrier_wavefront ? - // Intrinsic::amdgcn_s_barrier_signal ? LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Convergent intrinsic " << Callee->getName() << " detected.\n"); @@ -202,57 +201,6 @@ bool AMDGPUConvertWaveSize::run(Function &F) { return false; } - // Take care of LDS access - if (const auto *MTI = dyn_cast(&I)) { - auto DstAS = MTI->getDestAddressSpace(); - auto SrcAS = MTI->getSourceAddressSpace(); - if (DstAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access " - "(llvm.memcpy/memmove) detected.\n"); - return false; - } - } else if (const auto *MSI = dyn_cast(&I)) { - auto DstAS = MSI->getDestAddressSpace(); - if (DstAS == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access " - "(llvm.memset) detected.\n"); - return false; - } - } else if (const auto AMCI = dyn_cast(&I)) { - auto DstAS = AMCI->getDestAddressSpace(); - auto SrcAS = AMCI->getSourceAddressSpace(); - if (DstAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG( - dbgs() - << "AMDGPUConvertWaveSize: LDS access " - "(llvm.memcpy.element.unordered.atomic) detected\n"); - return false; - } - } else - if (const auto AMMI = dyn_cast(&I)) { - auto DstAS = AMMI->getDestAddressSpace(); - auto SrcAS = AMMI->getSourceAddressSpace(); - if (DstAS == AMDGPUAS::LOCAL_ADDRESS || - SrcAS == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG( - dbgs() - << "AMDGPUConvertWaveSize: LDS access " - "(llvm.memmove.element.unordered.atomic) detected.\n"); - return false; - } - } else if (const auto *AMSI = dyn_cast(&I)) 
{ - auto DstAS = AMSI->getDestAddressSpace(); - if (DstAS == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG( - dbgs() - << "AMDGPUConvertWaveSize: LDS access " - "(llvm.memset.element.unordered.atomic) detected.\n"); - return false; - } - } - // Save callee as a candidate for attribute change Callees.push_back(Callee); } @@ -271,7 +219,9 @@ bool AMDGPUConvertWaveSize::run(Function &F) { if (const auto AC = dyn_cast(&I)) { if (AC->getDestTy()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); + LLVM_DEBUG( + dbgs() + << "AMDGPUConvertWaveSize: addrspacecast to LDS detected.\n"); return false; } } @@ -280,36 +230,14 @@ bool AMDGPUConvertWaveSize::run(Function &F) { if (I2P->getDestTy()->isPointerTy() && I2P->getDestTy()->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: convertion int to LDS " + "pointer detected.\n"); return false; } } - // GEP may refer to the global LDS object - if (const auto GEP = dyn_cast(&I)) { - if (GEP->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); - return false; - } - } - - // Load/Store/Atomics may directly use global LDS object - if (const auto LI = dyn_cast(&I)) { - if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); - return false; - } - } - if (const auto SI = dyn_cast(&I)) { - if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: LDS access detected.\n"); - return false; - } - } - - if (const auto MemIntr = dyn_cast(&I)) - // TODO: Dynamic VGPRS and GFX11+ special operations ??? 
+ BlockCost += TTI->getInstructionCost(&I, TargetTransformInfo::TCK_Latency); } @@ -340,41 +268,22 @@ bool AMDGPUConvertWaveSize::run(Function &F) { // Additional checks can be added here... // If all checks pass, convert wave size from wave32 to wave64. - // Conversion logic goes here... - bool Changed = changeWaveSizeAttr(&F); - if (Changed) - // Now take care of the intrinsic calls - for (auto C : Callees) { - // TODO: if we could not change Attr for one of the callee - // we need to rollback all the changes! - changeWaveSizeAttr(C); - } + F.addFnAttr("target-features", "+wavefrontsize64"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for " + << F.getName() << " from wave32 to wave64.\n"); + // Now take care of the intrinsic calls + for (auto C : Callees) { + C->addFnAttr("target-features", "+wavefrontsize64"); + LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for " + << C->getName() << " from wave32 to wave64.\n"); + } - return Changed; + return true; } -bool AMDGPUConvertWaveSize::changeWaveSizeAttr(Function *F) { - auto Attr = F->getFnAttribute("target-features"); - if (Attr.isValid()) { - StringRef AttrStr = Attr.getValueAsString(); - size_t Pos = AttrStr.find("+wavefrontsize32"); - if (Pos != StringRef::npos) { - // Remove the "+wavefrontsize32" attribute. - std::string NewBegin = AttrStr.substr(0, Pos).str().append("+wavefrontsize64"); - std::string End = AttrStr.substr(Pos + strlen("+wavefrontsize32")).str(); - std::string NewAttrStr = NewBegin + End; - // Add the "+wavefrontsize64" attribute. 
- F->removeFnAttr("target-features"); - F->addFnAttr("target-features", NewAttrStr); - LLVM_DEBUG(dbgs() << "AMDGPUConvertWaveSize: Converted wave size for " - << F->getName() - << " from wave32 " - "to wave64.\n"); - return true; - } - } - return false; -} +//===----------------------------------------------------------------------===// +// Pass registration +//===----------------------------------------------------------------------===// INITIALIZE_PASS_BEGIN(AMDGPUConvertWaveSizeLegacy, DEBUG_TYPE, "AMDGPU convert wave size", false, false) diff --git a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll index d90e524e9cc2e..f43dc3235a05d 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32-to-64-auto-convert.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-convert-wave-size < %s | FileCheck %s define amdgpu_kernel void @test_not_wave32(ptr addrspace(1) %out) #0 { ; CHECK: @test_not_wave32{{.*}}) #0 @@ -66,6 +66,33 @@ define amdgpu_kernel void @test_lds_access(ptr addrspace(3) %out) #1 { ret void } +define amdgpu_kernel void @test_addrspacecast_to_lds(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(1) %in, i32 16 + %ptr = addrspacecast ptr addrspace(1) %gep to ptr addrspace(3) + %val = load i32, ptr addrspace(3) %ptr + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_bitcast_to_lds_ptr(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(1) %in, i32 16 + %lds = inttoptr i32 0 to ptr addrspace(3) + %val = load i32, ptr addrspace(3) %lds + store i32 %val, ptr addrspace(1) %out + ret void +} + +@lds = addrspace(3) global [256 x i32] zeroinitializer + +define amdgpu_kernel void 
@test_use_global_lds_object(ptr addrspace(1) %out, i1 %p) #0 { + %gep = getelementptr [256 x i32], ptr addrspace(3) @lds, i32 0, i32 10 + %ld = load i32, ptr addrspace(3) %gep + store i32 %ld, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @test_simple_loop(ptr addrspace(1) nocapture %arg) #1 { ; CHECK: @test_simple_loop{{.*}}) #1 bb: