diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 259b6c040706b..65f398af7902b 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace clang; using namespace CodeGen; @@ -72,6 +73,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; + /// Host-side shadow for the per-TU __llvm_profile_sections_ global, + /// emitted only for HIP host compiles when PGO is on. Registered via + /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime + /// can locate the device-side table by name. + llvm::GlobalVariable *OffloadProfShadow = nullptr; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -176,6 +182,13 @@ class CGNVCUDARuntime : public CGCUDARuntime { void transformManagedVars(); /// Create offloading entries to register globals in RDC mode. void createOffloadingEntries(); + /// For HIP+PGO, emit the per-TU __llvm_profile_sections_ global. + /// On the device side it is the populated 7-pointer section-bounds table. + /// On the host side it is a placeholder void* shadow stored in + /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) + /// or createOffloadingEntries (RDC) so the runtime can locate the + /// device-side table by name. + void emitOffloadProfilingSections(); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -735,6 +748,32 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } } + // Register the per-TU offload-profiling shadow so the host runtime can + // locate the matching device-side __llvm_profile_sections_. We + // emit both __hipRegisterVar (so the HIP runtime can map the host + // shadow to the device symbol) and + // __llvm_profile_offload_register_shadow_variable (so the profile + // runtime adds the shadow to its drain list). + if (OffloadProfShadow) { + llvm::Constant *Name = + makeConstantString(std::string(OffloadProfShadow->getName())); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + OffloadProfShadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1256,11 +1295,124 @@ void CGNVCUDARuntime::createOffloadingEntries() { I.Flags.getSurfTexType()); } } + + // Register the per-TU offload-profiling shadow. The offloading entry + // makes the linker-wrapper emit the host __hipRegisterVar call in the + // combined ctor. Separately emit a per-TU ctor that registers the + // shadow with the profile runtime's drain list. + if (OffloadProfShadow) { + llvm::offloading::emitOffloadingEntry( + M, Kind, OffloadProfShadow, OffloadProfShadow->getName(), + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + + llvm::LLVMContext &Ctx = M.getContext(); + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_shadow_variable"); + auto *CtorFn = llvm::Function::Create( + llvm::FunctionType::get(VoidTy, false), + llvm::GlobalValue::InternalLinkage, + "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M); + auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); + llvm::IRBuilder<> B(Entry); + B.CreateCall(RegisterShadow, {OffloadProfShadow}); + B.CreateRetVoid(); + llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); + } +} + +// For HIP host+device compiles with PGO enabled, emit the per-TU global +// __llvm_profile_sections_. Device side: a 7-pointer struct holding +// section start/stop bounds for the names/counters/data sections plus the +// raw-version variable. Host side: an opaque void* shadow whose only +// purpose is to give the host-runtime a registered symbol name to look up +// via hipGetSymbolAddress; the actual device-side data lives in the +// matching device-side global. +void CGNVCUDARuntime::emitOffloadProfilingSections() { + if (!CGM.getLangOpts().HIP) + return; + if (!CGM.getCodeGenOpts().hasProfileInstr()) + return; + + StringRef CUIDHash = CGM.getContext().getCUIDHash(); + if (CUIDHash.empty()) + return; + + llvm::Module &M = CGM.getModule(); + llvm::LLVMContext &Ctx = M.getContext(); + std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); + + // If the global already exists (e.g. another TU was merged in), don't + // duplicate it. + if (M.getNamedValue(Name)) + return; + + if (CGM.getLangOpts().CUDAIsDevice) { + // Device side: emit the populated struct. Section start/stop symbols + // are linker-defined (ELF auto-generates __start_/__stop_ for any + // section whose name is a valid C identifier; AMDGPU is ELF). + unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); + auto getOrDeclare = [&](StringRef SymName) { + if (auto *GV = M.getNamedGlobal(SymName)) + return GV; + auto *GV = new llvm::GlobalVariable( + M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + return GV; + }; + auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); + if (!VersionGV) { + VersionGV = new llvm::GlobalVariable( + M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, + llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, + "__llvm_profile_raw_version", + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + } + + auto *StructTy = llvm::StructType::get( + Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + llvm::Constant *Fields[] = { + getOrDeclare("__start___llvm_prf_names"), + getOrDeclare("__stop___llvm_prf_names"), + getOrDeclare("__start___llvm_prf_cnts"), + getOrDeclare("__stop___llvm_prf_cnts"), + getOrDeclare("__start___llvm_prf_data"), + getOrDeclare("__stop___llvm_prf_data"), + VersionGV, + }; + auto *Init = llvm::ConstantStruct::get(StructTy, Fields); + auto *GV = new llvm::GlobalVariable( + M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, + Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + GlobalAS); + GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); + CGM.addCompilerUsedGlobal(GV); + return; + } + + // Host side: emit an opaque void* shadow. Layout doesn't matter — the + // runtime locates it by name via hipGetSymbolAddress and treats it as + // the address of the device-side struct. Registration with the HIP + // runtime is added by makeRegisterGlobalsFn (non-RDC) or + // createOffloadingEntries (RDC). + auto *PtrTy = llvm::PointerType::getUnqual(Ctx); + OffloadProfShadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), Name); + CGM.addCompilerUsedGlobal(OffloadProfShadow); } // Returns module constructor to be added. llvm::Function *CGNVCUDARuntime::finalizeModule() { transformManagedVars(); + emitOffloadProfilingSections(); if (CGM.getLangOpts().CUDAIsDevice) { // Mark ODR-used device variables as compiler used to prevent it from being // eliminated by optimization. This is necessary for device variables diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 5f04afe34c554..d3c94c8addffa 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -902,6 +902,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, if (ActiveKinds & Action::OFK_HIP) CmdArgs.push_back( Args.MakeArgString(StringRef("-L") + RocmInstallation->getLibPath())); + + // For HIP device PGO, link clang_rt.profile_rocm when available. It is a + // self-contained superset of clang_rt.profile, emitted first so the base + // archive stays inert. + if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) && + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); } void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 6bc58699fb007..8141f9f132421 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -592,6 +592,13 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") + RocmInstallation->getLibPath()), "amdhip64.lib"}); + + // For HIP device PGO, link clang_rt.profile_rocm when available. It is a + // self-contained superset of clang_rt.profile, emitted first so the base + // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image). + if (needsProfileRT(Args) && + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); } } diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip new file mode 100644 index 0000000000000..17c6fe7b9e609 --- /dev/null +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -0,0 +1,50 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: x86-registered-target + +// Verify CGCUDANV emits the per-TU __llvm_profile_sections_ global +// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct +// in addrspace(1). Host compile: void* shadow registered with the HIP +// runtime and with the profile runtime's drain list. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=DEV %s + +// RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST %s + +// Guard: no PGO -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +// Guard: no CUID -> no emission. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=NONE %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ int helper(int x) { return x + 1; } +__global__ void kernel(int *p) { *p = helper(*p); } + +// DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_cnts = external hidden addrspace(1) global i8 +// DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 +// DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 +// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] + +// HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// HOST: define internal void @__hip_register_globals +// HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], +// HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) + +// NONE-NOT: __llvm_profile_sections_ +// NONE-NOT: __llvm_profile_offload_register_shadow_variable diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip b/clang/test/Driver/hip-profile-rocm-runtime.hip new file mode 100644 index 0000000000000..5e99d3f4f2fee --- /dev/null +++ b/clang/test/Driver/hip-profile-rocm-runtime.hip @@ -0,0 +1,31 @@ +// REQUIRES: x86-registered-target, amdgpu-registered-target +// UNSUPPORTED: system-windows + +// Build a fake resource dir containing both the base profile runtime and the +// ROCm device-profile runtime so the driver's existence check passes. +// RUN: rm -rf %t && mkdir -p %t/lib/x86_64-unknown-linux +// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile.a +// RUN: touch %t/lib/x86_64-unknown-linux/libclang_rt.profile_rocm.a +// RUN: touch %t.o + +// HIP host link with PGO links clang_rt.profile_rocm. +// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \ +// RUN: -fprofile-instr-generate -resource-dir=%t \ +// RUN: --rocm-path=%S/Inputs/rocm %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HIP-PGO %s +// profile_rocm must precede the base profile so the base archive stays inert. +// HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a" +// HIP-PGO: "{{.*}}libclang_rt.profile.a" + +// Without profiling, the ROCm device-profile runtime is not linked. +// RUN: %clang -### --hip-link --target=x86_64-unknown-linux \ +// RUN: -resource-dir=%t --rocm-path=%S/Inputs/rocm %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HIP-NOPGO %s +// HIP-NOPGO-NOT: libclang_rt.profile_rocm.a + +// A non-HIP host link with PGO does not link the ROCm device-profile runtime. +// RUN: %clang -### --target=x86_64-unknown-linux \ +// RUN: -fprofile-instr-generate -resource-dir=%t %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=HOST-PGO %s +// HOST-PGO: "{{.*}}libclang_rt.profile.a" +// HOST-PGO-NOT: libclang_rt.profile_rocm.a diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index cbdc48a9a717f..37bad559f49e7 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -500,6 +500,9 @@ def DEOPTIMIZE : RuntimeLibcall; // Return address def RETURN_ADDRESS : RuntimeLibcall; +// GPU profiling +def PROFILE_INSTRUMENT_GPU : RuntimeLibcall; + // Clear cache def CLEAR_CACHE : RuntimeLibcall; def RISCV_FLUSH_ICACHE : RuntimeLibcall; @@ -2241,8 +2244,10 @@ def WindowsARM64ECSystemLibrary def isAMDGPU : RuntimeLibcallPredicate<"TT.isAMDGPU()">; -// No calls. -def AMDGPUSystemLibrary : SystemRuntimeLibrary; +def __llvm_profile_instrument_gpu : RuntimeLibcallImpl; + +def AMDGPUSystemLibrary + : SystemRuntimeLibrary; //===----------------------------------------------------------------------===// // ARM Runtime Libcalls diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 33e55960509a7..90471e910bb50 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -122,6 +122,11 @@ inline StringRef getInstrProfValueProfMemOpFuncName() { /// Return the prefix of the name of the variables to function as a filter. inline StringRef getInstrProfVarPrefix() { return "__prof"; } +/// Return the name of the GPU wave-cooperative counter increment helper. +inline StringRef getInstrProfInstrumentGPUFuncName() { + return INSTR_PROF_INSTRUMENT_GPU_FUNC_STR; +} + /// Return the name prefix of variables containing instrumented function names. inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; } diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index dabd495cddd49..8e4ba41919768 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -33,14 +34,17 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" @@ -287,6 +291,8 @@ class InstrLowerer final { GlobalVariable *NamesVar = nullptr; size_t NamesSize = 0; + StructType *ProfileDataTy = nullptr; + // vector of counter load/store pairs to be register promoted. std::vector PromotionCandidates; @@ -407,6 +413,9 @@ class InstrLowerer final { /// Create a static initializer for our data, on platforms that need it, /// and for any profile output file that was specified. void emitInitialization(); + + /// Return the __llvm_profile_data struct type. + StructType *getProfileDataTy(); }; /// @@ -1190,19 +1199,22 @@ void InstrLowerer::lowerTimestamp( void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) { auto *Addr = getCounterAddress(Inc); - IRBuilder<> Builder(Inc); if (isGPUProfTarget(M)) { - auto *I64Ty = Builder.getInt64Ty(); + auto *Int64Ty = Builder.getInt64Ty(); auto *PtrTy = Builder.getPtrTy(); auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()), - {PtrTy, PtrTy, I64Ty}, false); - auto Callee = - M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy); + {PtrTy, PtrTy, Int64Ty}, false); + FunctionCallee Callee = + M.getOrInsertFunction(RTLIB::RuntimeLibcallsInfo::getLibcallImplName( + RTLIB::impl___llvm_profile_instrument_gpu), + CalleeTy); Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy); Value *Uniform = ConstantPointerNull::get(PointerType::getUnqual(M.getContext())); - Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()}); + Value *StepI64 = + Builder.CreateZExtOrTrunc(Inc->getStep(), Int64Ty, "step.i64"); + Builder.CreateCall(Callee, {CastAddr, Uniform, StepI64}); } else if (Options.Atomic || AtomicCounterUpdateAll || (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), @@ -1400,6 +1412,12 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { if (shouldUsePublicSymbol(Fn)) return Fn; + // For GPU targets, weak functions cannot use private aliases because + // LTO may pick a different TU's copy, leaving the alias undefined + if (isGPUProfTarget(*Fn->getParent()) && + GlobalValue::isWeakForLinker(Fn->getLinkage())) + return Fn; + // When possible use a private alias to avoid symbolic relocations. auto *GA = GlobalAlias::create(GlobalValue::LinkageTypes::PrivateLinkage, Fn->getName() + ".local", Fn); @@ -1623,11 +1641,15 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc, } Ptr->setVisibility(Visibility); - // Put the counters and bitmaps in their own sections so linkers can - // remove unneeded sections. Ptr->setSection(getInstrProfSectionName(IPSK, TT.getObjectFormat())); Ptr->setLinkage(Linkage); - maybeSetComdat(Ptr, Fn, VarName); + if (isGPUProfTarget(M) && !Ptr->hasComdat()) { + Ptr->setComdat(M.getOrInsertComdat(VarName)); + Ptr->setLinkage(GlobalValue::LinkOnceODRLinkage); + Ptr->setVisibility(GlobalValue::ProtectedVisibility); + } else { + maybeSetComdat(Ptr, Fn, VarName); + } return Ptr; } @@ -1799,7 +1821,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); - auto *CounterPtr = PD.RegionCounters; + + Constant *CounterPtr = PD.RegionCounters; uint64_t NumBitmapBytes = PD.NumBitmapBytes; @@ -1807,11 +1830,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); - Type *DataTypes[] = { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, -#include "llvm/ProfileData/InstrProfData.inc" - }; - auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + auto *DataTy = getProfileDataTy(); Constant *FunctionAddr = getFuncAddrForProfData(Fn); @@ -1819,6 +1838,15 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + // For GPU targets, weak functions need weak linkage for their profile data + // aliases to allow linker deduplication across TUs + if (GlobalValue::isWeakForLinker(Fn->getLinkage())) + Linkage = Fn->getLinkage(); + else + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1830,19 +1858,22 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. - if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + if (!isGPUProfTarget(M) && NS == 0 && + !(DataReferencedByCode && NeedComdat && !Renamed) && (TT.isOSBinFormatELF() || (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } - // AMDGPU objects are always ET_DYN, so non-local symbols with default - // visibility are preemptible. The CounterPtr label difference emits a REL32 - // relocation that lld rejects against preemptible targets. - if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage)) + // GPU-target ELF objects are always ET_DYN, so non-local symbols with + // default visibility are preemptible. The CounterPtr label difference + // emits a REL32 relocation that lld rejects against preemptible targets. + if (TT.isGPU() && TT.isOSBinFormatELF() && + !GlobalValue::isLocalLinkage(Linkage)) Visibility = GlobalValue::ProtectedVisibility; auto *Data = new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName); + Constant *RelativeCounterPtr; GlobalVariable *BitmapPtr = PD.RegionBitmaps; Constant *RelativeBitmapPtr = ConstantInt::get(IntPtrTy, 0); @@ -1883,7 +1914,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { Data->setSection( getInstrProfSectionName(DataSectionKind, TT.getObjectFormat())); Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); - maybeSetComdat(Data, Fn, CntsVarName); + if (isGPUProfTarget(M) && !Data->hasComdat()) { + Data->setComdat(M.getOrInsertComdat(CntsVarName)); + Data->setLinkage(GlobalValue::LinkOnceODRLinkage); + } else { + maybeSetComdat(Data, Fn, CntsVarName); + } PD.DataVar = Data; @@ -1961,16 +1997,18 @@ void InstrLowerer::emitNameData() { auto &Ctx = M.getContext(); auto *NamesVal = ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); - NamesVar = new GlobalVariable(M, NamesVal->getType(), true, - GlobalValue::PrivateLinkage, NamesVal, - getInstrProfNamesVarName()); + std::string NamesVarName = std::string(getInstrProfNamesVarName()); + NamesVar = + new GlobalVariable(M, NamesVal->getType(), true, + GlobalValue::PrivateLinkage, NamesVal, NamesVarName); NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); - NamesVar->setSection( + std::string NamesSectionName = ProfileCorrelate == InstrProfCorrelator::BINARY ? getInstrProfSectionName(IPSK_covname, TT.getObjectFormat()) - : getInstrProfSectionName(IPSK_name, TT.getObjectFormat())); + : getInstrProfSectionName(IPSK_name, TT.getObjectFormat()); + NamesVar->setSection(NamesSectionName); // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. @@ -2179,3 +2217,22 @@ void createProfileSamplingVar(Module &M) { appendToCompilerUsed(M, SamplingVar); } } // namespace llvm + +// For GPU targets: Allocate contiguous arrays for all profile data. +// This solves the linker reordering problem by using ONE symbol per section +// type, so there's nothing for the linker to reorder. +StructType *InstrLowerer::getProfileDataTy() { + if (ProfileDataTy) + return ProfileDataTy; + + auto &Ctx = M.getContext(); + auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1); + Type *DataTypes[] = { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + ProfileDataTy = StructType::get(Ctx, ArrayRef(DataTypes)); + return ProfileDataTy; +} diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index db032d6fcad45..b6d07aa821e7f 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -380,7 +380,8 @@ class FunctionInstrumenter final { // another counter range within the context. bool isValueProfilingDisabled() const { return DisableValueProfiling || - InstrumentationType == PGOInstrumentationType::CTXPROF; + InstrumentationType == PGOInstrumentationType::CTXPROF || + M.getTargetTriple().isGPU(); } bool shouldInstrumentEntryBB() const { diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll new file mode 100644 index 0000000000000..efe53ab1ebdfb --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-instrumentation.ll @@ -0,0 +1,32 @@ +;; Test basic AMDGPU PGO instrumentation lowering. +;; Verifies that each instrumentation point lowers directly to a call to +;; __llvm_profile_instrument_gpu with a null uniform-counter argument. + +; RUN: opt %s -mtriple=amdgcn-amd-amdhsa -passes=instrprof -S | FileCheck %s + +@__hip_cuid_test01 = addrspace(1) global i8 0 +@__profn_test_kernel = private constant [11 x i8] c"test_kernel" + +define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out, i32 %n) { +entry: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 0) + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + call void @llvm.instrprof.increment(ptr @__profn_test_kernel, i64 111, i32 4, i32 1) + store i32 1, ptr addrspace(1) %out + br label %if.end + +if.end: + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK-LABEL: define {{.*}} @test_kernel +; CHECK-NOT: @__llvm_profile_sampling_gpu +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) @__profc_test_kernel to ptr), ptr null, i64 1) +; CHECK: call void @__llvm_profile_instrument_gpu( +; CHECK-SAME: ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([4 x i64], ptr addrspace(1) @__profc_test_kernel, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll new file mode 100644 index 0000000000000..eab78fb3591b1 --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/amdgpu-profc-arrays.ll @@ -0,0 +1,26 @@ +;; Per-kernel __profc_* arrays land in section __llvm_prf_cnts with one slot +;; per counter, and counter increments lower to __llvm_profile_instrument_gpu +;; calls whose pointer argument is a GEP into the per-kernel array. + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instrprof < %s | FileCheck %s + +@__profn_kernel1 = private constant [7 x i8] c"kernel1" +@__profn_kernel2 = private constant [7 x i8] c"kernel2" + +; CHECK: @__profc_kernel1 = linkonce_odr protected addrspace(1) global [2 x i64] zeroinitializer, section "__llvm_prf_cnts" +; CHECK: @__profc_kernel2 = linkonce_odr protected addrspace(1) global [1 x i64] zeroinitializer, section "__llvm_prf_cnts" + +define amdgpu_kernel void @kernel1() { + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 0) + call void @llvm.instrprof.increment(ptr @__profn_kernel1, i64 12345, i32 2, i32 1) + ret void +} + +define amdgpu_kernel void @kernel2() { + call void @llvm.instrprof.increment(ptr @__profn_kernel2, i64 67890, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) + +; CHECK: call void @__llvm_profile_instrument_gpu(ptr addrspacecast (ptr addrspace(1) getelementptr inbounds ([2 x i64], ptr addrspace(1) @__profc_kernel1, i32 0, i32 1) to ptr), ptr null, i64 1) diff --git a/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll new file mode 100644 index 0000000000000..ce16f1ee3215f --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/gpu-weak.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -passes=instrprof -S | FileCheck %s + +; Test that weak functions on GPU targets get weak linkage for their +; __profd_ aliases to allow linker deduplication across TUs. +; Non-weak functions get external linkage (default for aliases). + +target triple = "amdgcn-amd-amdhsa" + +@__hip_cuid_abc123 = addrspace(1) global i8 0 + +; AMDGPU GPU profiling lowers to per-function comdat globals (not aliases). +; CHECK: @__profd_weak_func = linkonce_odr protected addrspace(1) global +@__profn_weak_func = private constant [9 x i8] c"weak_func" + +define weak void @weak_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_weak_odr_func = linkonce_odr protected addrspace(1) global +@__profn_weak_odr_func = private constant [13 x i8] c"weak_odr_func" + +define weak_odr void @weak_odr_func() { + call void @llvm.instrprof.increment(ptr @__profn_weak_odr_func, i64 0, i32 1, i32 0) + ret void +} + +; CHECK: @__profd_normal_func = linkonce_odr protected addrspace(1) global +@__profn_normal_func = private constant [11 x i8] c"normal_func" + +define void @normal_func() { + call void @llvm.instrprof.increment(ptr @__profn_normal_func, i64 0, i32 1, i32 0) + ret void +} + +declare void @llvm.instrprof.increment(ptr, i64, i32, i32) diff --git a/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll new file mode 100644 index 0000000000000..21b1d68004b13 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/amdgpu-disable-value-profiling.ll @@ -0,0 +1,22 @@ +;; Test that value profiling (indirect call profiling) is disabled for GPU targets. +;; The device-side profiling runtime does not implement +;; __llvm_profile_instrument_target, so indirect call profiling must not be emitted. + +; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +@fptr = addrspace(1) global ptr null, align 8 + +;; Verify that regular block instrumentation IS emitted +; CHECK: call void @llvm.instrprof.increment + +;; Verify that value profiling for indirect calls is NOT emitted +; CHECK-NOT: call void @llvm.instrprof.value.profile + +define amdgpu_kernel void @test_indirect_call() { +entry: + %fp = load ptr, ptr addrspace(1) @fptr, align 8 + call void %fp() + ret void +}