diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 65f398af7902b..f19813db189ef 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/ReplaceConstant.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Format.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -37,6 +38,10 @@ namespace { constexpr unsigned CudaFatMagic = 0x466243b1; constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" +static std::string getOffloadProfilingNamesVarName(llvm::StringRef CUIDHash) { + return (llvm::Twine(llvm::getInstrProfNamesVarName()) + "_" + CUIDHash).str(); +} + class CGNVCUDARuntime : public CGCUDARuntime { /// The prefix used for function calls and section names (CUDA, HIP, LLVM) @@ -78,6 +83,11 @@ class CGNVCUDARuntime : public CGCUDARuntime { /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime /// can locate the device-side table by name. llvm::GlobalVariable *OffloadProfShadow = nullptr; + struct OffloadProfSectionShadowInfo { + llvm::GlobalVariable *Shadow; + std::string DeviceName; + }; + llvm::SmallVector OffloadProfSectionShadows; /// Whether we generate relocatable device code. bool RelocatableDeviceCode; /// Mangle context for device. @@ -774,6 +784,27 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); } + if (!OffloadProfSectionShadows.empty()) { + llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_section_shadow_variable"); + for (const auto &Info : OffloadProfSectionShadows) { + llvm::Constant *Name = makeConstantString(Info.DeviceName); + llvm::Value *RegisterVarArgs[] = { + &GpuBinaryHandlePtr, + Info.Shadow, + Name, + Name, + llvm::ConstantInt::get(IntTy, /*Extern=*/0), + llvm::ConstantInt::get(VarSizeTy, + CGM.getDataLayout().getPointerSize()), + llvm::ConstantInt::get(IntTy, /*Constant=*/0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, RegisterVarArgs); + Builder.CreateCall(RegisterSectionShadow, {Info.Shadow}); + } + } + Builder.CreateRetVoid(); return RegisterKernelsFunc; } @@ -1311,6 +1342,9 @@ void CGNVCUDARuntime::createOffloadingEntries() { llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, {PtrTy}, false), "__llvm_profile_offload_register_shadow_variable"); + llvm::FunctionCallee RegisterSectionShadow = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, {PtrTy}, false), + "__llvm_profile_offload_register_section_shadow_variable"); auto *CtorFn = llvm::Function::Create( llvm::FunctionType::get(VoidTy, false), llvm::GlobalValue::InternalLinkage, @@ -1318,6 +1352,13 @@ void CGNVCUDARuntime::createOffloadingEntries() { auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); llvm::IRBuilder<> B(Entry); B.CreateCall(RegisterShadow, {OffloadProfShadow}); + for (const auto &Info : OffloadProfSectionShadows) { + llvm::offloading::emitOffloadingEntry( + M, Kind, Info.Shadow, Info.DeviceName, + CGM.getDataLayout().getPointerSize(), + llvm::offloading::OffloadGlobalEntry, /*Data=*/0); + B.CreateCall(RegisterSectionShadow, {Info.Shadow}); + } B.CreateRetVoid(); llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); } @@ -1350,50 +1391,23 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() { return; if (CGM.getLangOpts().CUDAIsDevice) { - // Device side: emit the populated struct. Section start/stop symbols - // are linker-defined (ELF auto-generates __start_/__stop_ for any - // section whose name is a valid C identifier; AMDGPU is ELF). + // Device side: emit only the per-TU names postfix marker. The sections + // struct is emitted later by the InstrProfiling pass, which emits it only + // when the TU has profile data, avoiding dangling section references. unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); - auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); - auto getOrDeclare = [&](StringRef SymName) { - if (auto *GV = M.getNamedGlobal(SymName)) - return GV; - auto *GV = new llvm::GlobalVariable( - M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - GlobalAS); - GV->setVisibility(llvm::GlobalValue::HiddenVisibility); - return GV; - }; - auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); - if (!VersionGV) { - VersionGV = new llvm::GlobalVariable( - M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, - "__llvm_profile_raw_version", + std::string NamesVarPostfixVarName = + std::string(llvm::getInstrProfNamesVarPostfixVarName()); + if (!M.getNamedValue(NamesVarPostfixVarName)) { + auto *NamesVarPostfix = llvm::ConstantDataArray::getString( + Ctx, (llvm::Twine("_") + CUIDHash).str(), true); + auto *NamesGV = new llvm::GlobalVariable( + M, NamesVarPostfix->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, NamesVarPostfix, + NamesVarPostfixVarName, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, GlobalAS); + CGM.addCompilerUsedGlobal(NamesGV); } - - auto *StructTy = llvm::StructType::get( - Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); - llvm::Constant *Fields[] = { - getOrDeclare("__start___llvm_prf_names"), - getOrDeclare("__stop___llvm_prf_names"), - getOrDeclare("__start___llvm_prf_cnts"), - getOrDeclare("__stop___llvm_prf_cnts"), - getOrDeclare("__start___llvm_prf_data"), - getOrDeclare("__stop___llvm_prf_data"), - VersionGV, - }; - auto *Init = llvm::ConstantStruct::get(StructTy, Fields); - auto *GV = new llvm::GlobalVariable( - M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, - Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - GlobalAS); - GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); - CGM.addCompilerUsedGlobal(GV); return; } @@ -1407,6 +1421,25 @@ void CGNVCUDARuntime::emitOffloadProfilingSections() { M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, llvm::ConstantPointerNull::get(PtrTy), Name); CGM.addCompilerUsedGlobal(OffloadProfShadow); + + auto AddSectionShadow = [&](StringRef Kind, StringRef DeviceName) { + std::string ShadowName = + (Twine("__llvm_profile_shadow_") + Kind + "_" + CUIDHash + "_" + + Twine(OffloadProfSectionShadows.size())) + .str(); + auto *Shadow = new llvm::GlobalVariable( + M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), ShadowName); + CGM.addCompilerUsedGlobal(Shadow); + OffloadProfSectionShadows.push_back({Shadow, DeviceName.str()}); + }; + + for (auto &&I : EmittedKernels) { + std::string KernelName = getDeviceSideName(cast(I.D)); + AddSectionShadow("data", std::string("__profd_") + KernelName); + AddSectionShadow("cnts", std::string("__profc_") + KernelName); + AddSectionShadow("names", getOffloadProfilingNamesVarName(CUIDHash)); + } } // Returns module constructor to be added. diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index d3c94c8addffa..65a15f9ed16e1 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -907,8 +907,13 @@ void Linux::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, // self-contained superset of clang_rt.profile, emitted first so the base // archive stays inert. if ((ActiveKinds & Action::OFK_HIP) && needsProfileRT(Args) && - getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) { CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); + // Force-retain the constructor-only hipModuleLoad* interceptor object; its + // constructor self-skips when the program does not use hipModuleLoad. + CmdArgs.push_back("-u"); + CmdArgs.push_back("__llvm_profile_offload_register_dynamic_module"); + } } void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 8141f9f132421..f7c055eeea405 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -597,8 +597,14 @@ void MSVCToolChain::addOffloadRTLibs(unsigned ActiveKinds, const ArgList &Args, // self-contained superset of clang_rt.profile, emitted first so the base // archive stays inert (avoiding a /MD-vs-/MT CRT mix in the host image). if (needsProfileRT(Args) && - getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) + getVFS().exists(getCompilerRT(Args, "profile_rocm", FT_Static))) { CmdArgs.push_back(getCompilerRTArgString(Args, "profile_rocm")); + // Force the linker to retain the constructor-only hipModuleLoad* + // interceptor object from clang_rt.profile_rocm (see Linux.cpp). The + // constructor self-skips for programs that do not use hipModuleLoad. + CmdArgs.push_back( + "-include:__llvm_profile_offload_register_dynamic_module"); + } } } diff --git a/clang/test/CodeGenHIP/offload-pgo-sections.hip b/clang/test/CodeGenHIP/offload-pgo-sections.hip index 17c6fe7b9e609..d21ba30012212 100644 --- a/clang/test/CodeGenHIP/offload-pgo-sections.hip +++ b/clang/test/CodeGenHIP/offload-pgo-sections.hip @@ -1,18 +1,26 @@ // REQUIRES: amdgpu-registered-target // REQUIRES: x86-registered-target -// Verify CGCUDANV emits the per-TU __llvm_profile_sections_ global -// for HIP+PGO compilations. Device subcompile: populated 7-pointer struct -// in addrspace(1). Host compile: void* shadow registered with the HIP -// runtime and with the profile runtime's drain list. +// Verify the per-TU __llvm_profile_sections_ global for HIP+PGO. +// Device side: clang emits the names-postfix marker, and the InstrProfiling +// pass emits the populated 7-pointer struct in addrspace(1) -- but only when +// the TU actually has profile data records. Host compile: void* shadow +// registered with the HIP runtime and the profile runtime's drain list. +// The device struct is emitted by the InstrProfiling pass (not clang codegen), +// so run the pass to observe it. // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | opt -passes=instrprof -S \ // RUN: | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -cuid=abc \ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=HOST %s +// +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fgpu-rdc --offload-new-driver \ +// RUN: -cuid=abc -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=HOST-RDC %s // Guard: no PGO -> no emission. // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ @@ -24,11 +32,25 @@ // RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=NONE %s +// Guard: PGO on but no instrumented device functions (all device code is +// constexpr/host-only) -> the pass must not emit the sections struct, so its +// section references don't dangle at the device link. +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \ +// RUN: -fprofile-instrument=clang -emit-llvm -o - -x hip \ +// RUN: -DEMPTY_DEVICE %s \ +// RUN: | opt -passes=instrprof -S \ +// RUN: | FileCheck -check-prefix=EMPTY %s + #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) +#ifdef EMPTY_DEVICE +// No __global__/instrumented device function: device code folds away. +__device__ constexpr int dead(int x) { return x + 1; } +#else __device__ int helper(int x) { return x + 1; } __global__ void kernel(int *p) { *p = helper(*p); } +#endif // DEV-DAG: @__start___llvm_prf_names = external hidden addrspace(1) global i8 // DEV-DAG: @__stop___llvm_prf_names = external hidden addrspace(1) global i8 @@ -37,14 +59,34 @@ __global__ void kernel(int *p) { *p = helper(*p); } // DEV-DAG: @__start___llvm_prf_data = external hidden addrspace(1) global i8 // DEV-DAG: @__stop___llvm_prf_data = external hidden addrspace(1) global i8 // DEV-DAG: @__llvm_profile_raw_version = external addrspace(1) constant i64 -// DEV: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version -// DEV: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] +// DEV-DAG: @__llvm_prf_nm_[[CUID:[0-9a-f]+]] = protected addrspace(1) constant {{.*}}section "__llvm_prf_names" +// DEV-DAG: @__llvm_profile_sections_[[CUID]] = protected addrspace(1) constant {{.*}}@__start___llvm_prf_names{{.*}}@__stop___llvm_prf_names{{.*}}@__start___llvm_prf_cnts{{.*}}@__stop___llvm_prf_cnts{{.*}}@__start___llvm_prf_data{{.*}}@__stop___llvm_prf_data{{.*}}@__llvm_profile_raw_version +// DEV-DAG: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] // HOST: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST-DAG: @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}} = global ptr null // HOST: @llvm.compiler.used = {{.*}}@__llvm_profile_sections_[[CUID]] // HOST: define internal void @__hip_register_globals // HOST: call void @__hipRegisterVar({{.*}}@__llvm_profile_sections_[[CUID]], // HOST: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) +// HOST: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_names_[[CUID]]_{{[0-9]+}}) + +// HOST-RDC: @__llvm_profile_sections_[[CUID:[0-9a-f]+]] = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_data_[[CUID]]_0 = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_cnts_[[CUID]]_1 = global ptr null +// HOST-RDC-DAG: @__llvm_profile_shadow_names_[[CUID]]_2 = global ptr null +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_sections_[[CUID]] +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_data_[[CUID]]_0 +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_cnts_[[CUID]]_1 +// HOST-RDC-DAG: @.offloading.entry.{{.*}} = weak constant %struct.__tgt_offload_entry {{.*}}ptr @__llvm_profile_shadow_names_[[CUID]]_2 +// HOST-RDC: define internal void @__llvm_profile_register_shadow.[[CUID]]() +// HOST-RDC: call void @__llvm_profile_offload_register_shadow_variable(ptr @__llvm_profile_sections_[[CUID]]) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_data_[[CUID]]_0) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_cnts_[[CUID]]_1) +// HOST-RDC-DAG: call void @__llvm_profile_offload_register_section_shadow_variable(ptr @__llvm_profile_shadow_names_[[CUID]]_2) // NONE-NOT: __llvm_profile_sections_ // NONE-NOT: __llvm_profile_offload_register_shadow_variable + +// EMPTY-NOT: @__llvm_profile_sections_ +// EMPTY-NOT: @__start___llvm_prf_data diff --git a/clang/test/Driver/hip-profile-rocm-runtime.hip b/clang/test/Driver/hip-profile-rocm-runtime.hip index 5e99d3f4f2fee..fc82db4fc13c0 100644 --- a/clang/test/Driver/hip-profile-rocm-runtime.hip +++ b/clang/test/Driver/hip-profile-rocm-runtime.hip @@ -15,6 +15,8 @@ // RUN: | FileCheck -check-prefix=HIP-PGO %s // profile_rocm must precede the base profile so the base archive stays inert. // HIP-PGO: "{{.*}}libclang_rt.profile_rocm.a" +// The constructor-only hipModuleLoad* interceptor object is force-linked. +// HIP-PGO: "-u" "__llvm_profile_offload_register_dynamic_module" // HIP-PGO: "{{.*}}libclang_rt.profile.a" // Without profiling, the ROCm device-profile runtime is not linked. diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 9ea5a2638fac9..98a524392e54a 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1383,7 +1383,7 @@ int __llvm_write_custom_profile(const char *Target, TargetFilename[FilenameLength + 1 + TargetLength] = 0; /* Open and truncate target-specific PGO file */ - FILE *OutputFile = fopen(TargetFilename, "w"); + FILE *OutputFile = fopen(TargetFilename, "wb"); setProfileFile(OutputFile); if (!OutputFile) { @@ -1404,11 +1404,10 @@ int __llvm_write_custom_profile(const char *Target, if (VersionOverride) Version = *VersionOverride; - /* Write custom data to the file */ - ReturnValue = - lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL, - NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version); + ReturnValue = lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, + CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NamesBegin, NamesEnd, + NULL, NULL, NULL, NULL, 0, Version); closeFileObject(OutputFile); // Restore SIGKILL. diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp index ee00c572e3a42..e972f22fdae8f 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp +++ b/compiler-rt/lib/profile/InstrProfilingPlatformROCm.cpp @@ -57,8 +57,9 @@ static void unlockDynamicModules(void) { } #endif -static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, - const char *Target); +struct OffloadSectionShadowGroup; +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target, + const OffloadSectionShadowGroup *Sections); static int isVerboseMode() { static int IsVerbose = -1; @@ -72,6 +73,7 @@ static int isVerboseMode() { /* -------------------------------------------------------------------------- */ typedef int (*hipGetSymbolAddressTy)(void **, const void *); +typedef int (*hipGetSymbolSizeTy)(size_t *, const void *); typedef int (*hipMemcpyTy)(void *, const void *, size_t, int); typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *); typedef int (*hipGetDeviceCountTy)(int *); @@ -88,6 +90,7 @@ typedef struct { typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int); static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr; +static hipGetSymbolSizeTy pHipGetSymbolSize = nullptr; static hipMemcpyTy pHipMemcpy = nullptr; static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr; static hipGetDeviceCountTy pHipGetDeviceCount = nullptr; @@ -113,17 +116,25 @@ static void doEnsureHipLoaded(void) { } #ifdef _WIN32 - static const char HipLibName[] = "amdhip64.dll"; + const char *HipLibName = "amdhip64_7.dll"; #else - static const char HipLibName[] = "libamdhip64.so"; + const char *HipLibName = "libamdhip64.so"; #endif void *Handle = __interception::OpenLibrary(HipLibName); +#ifdef _WIN32 + if (!Handle) { + HipLibName = "amdhip64.dll"; + Handle = __interception::OpenLibrary(HipLibName); + } +#endif if (!Handle) return; pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol( Handle, "hipGetSymbolAddress"); + pHipGetSymbolSize = (hipGetSymbolSizeTy)__interception::LookupSymbol( + Handle, "hipGetSymbolSize"); pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(Handle, "hipMemcpy"); pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol( Handle, "hipModuleGetGlobal"); @@ -189,6 +200,11 @@ static int hipGetSymbolAddress(void **devPtr, const void *symbol) { return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1; } +static int hipGetSymbolSize(size_t *size, const void *symbol) { + ensureHipLoaded(); + return pHipGetSymbolSize ? pHipGetSymbolSize(size, symbol) : -1; +} + static int hipMemcpy(void *dest, const void *src, size_t len, int kind /*2=DToH*/) { ensureHipLoaded(); @@ -471,7 +487,7 @@ extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) { char TargetWithTU[64]; snprintf(TargetWithTU, sizeof(TargetWithTU), "%s.%d", ArchName, TUIndex); - if (processDeviceOffloadPrf(TU->DeviceVar, TUIndex, TargetWithTU) == 0) + if (processDeviceOffloadPrf(TU->DeviceVar, TargetWithTU, nullptr) == 0) TU->Processed = 1; else PROF_WARN("failed to process profile data for module %p TU %d\n", Ptr, @@ -505,23 +521,93 @@ static void **OffloadShadowVariables = nullptr; static int NumShadowVariables = 0; static int CapShadowVariables = 0; +struct OffloadSectionShadow { + void *Data; + void *Counters; + void *Names; +}; + +struct OffloadSectionShadowGroup { + OffloadSectionShadow *Shadows; + int NumShadows; + int CapShadows; + int NumSections; +}; + +static OffloadSectionShadowGroup *OffloadSectionShadowGroups = nullptr; +static int CapSectionShadowGroups = 0; + +static int ensureSectionShadowGroupCapacity(void) { + if (CapSectionShadowGroups >= CapShadowVariables) + return 0; + OffloadSectionShadowGroup *New = (OffloadSectionShadowGroup *)realloc( + OffloadSectionShadowGroups, CapShadowVariables * sizeof(*New)); + if (!New) + return -1; + __builtin_memset(New + CapSectionShadowGroups, 0, + (CapShadowVariables - CapSectionShadowGroups) * + sizeof(*New)); + OffloadSectionShadowGroups = New; + CapSectionShadowGroups = CapShadowVariables; + return 0; +} + +static int ensureSectionShadowCapacity(OffloadSectionShadowGroup *Group, + int MinCapacity) { + if (Group->CapShadows >= MinCapacity) + return 0; + int NewCap = Group->CapShadows ? Group->CapShadows * 2 : 4; + while (NewCap < MinCapacity) + NewCap *= 2; + OffloadSectionShadow *New = + (OffloadSectionShadow *)realloc(Group->Shadows, NewCap * sizeof(*New)); + if (!New) + return -1; + __builtin_memset(New + Group->CapShadows, 0, + (NewCap - Group->CapShadows) * sizeof(*New)); + Group->Shadows = New; + Group->CapShadows = NewCap; + return 0; +} + extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) { if (growPtrArray(&OffloadShadowVariables, &NumShadowVariables, &CapShadowVariables, 64)) return; - OffloadShadowVariables[NumShadowVariables++] = ptr; + if (ensureSectionShadowGroupCapacity()) + return; + int Index = NumShadowVariables++; + OffloadShadowVariables[Index] = ptr; + __builtin_memset(&OffloadSectionShadowGroups[Index], 0, + sizeof(OffloadSectionShadowGroups[Index])); } -static void **OffloadSectionShadowVariables = nullptr; -static int NumSectionShadowVariables = 0; -static int CapSectionShadowVariables = 0; - extern "C" void __llvm_profile_offload_register_section_shadow_variable(void *ptr) { - if (growPtrArray(&OffloadSectionShadowVariables, &NumSectionShadowVariables, - &CapSectionShadowVariables, 64)) + if (NumShadowVariables == 0) return; - OffloadSectionShadowVariables[NumSectionShadowVariables++] = ptr; + + OffloadSectionShadowGroup *Group = + &OffloadSectionShadowGroups[NumShadowVariables - 1]; + int ShadowIndex = Group->NumSections / 3; + if (ensureSectionShadowCapacity(Group, ShadowIndex + 1)) + return; + if (ShadowIndex >= Group->NumShadows) + Group->NumShadows = ShadowIndex + 1; + + OffloadSectionShadow *Shadow = &Group->Shadows[ShadowIndex]; + switch (Group->NumSections % 3) { + case 0: + Shadow->Data = ptr; + break; + case 1: + Shadow->Counters = ptr; + break; + case 2: + Shadow->Names = ptr; + break; + } + ++Group->NumSections; } namespace { @@ -547,8 +633,41 @@ struct UniqueFree { } // namespace -static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, - const char *Target) { +static int getRegisteredSectionBounds(void *Shadow, void **DevicePtr, + size_t *Size) { + *DevicePtr = nullptr; + *Size = 0; + int AddrRc = hipGetSymbolAddress(DevicePtr, Shadow); + int SizeRc = hipGetSymbolSize(Size, Shadow); + return AddrRc == 0 && SizeRc == 0 && *DevicePtr && *Size > 0 ? 0 : -1; +} + +struct RegisteredSectionRange { + const void *Data; + const void *Counters; + const void *Names; + size_t DataSize; + size_t CountersSize; + size_t NamesSize; + size_t DataOffset; + size_t CountersOffset; + size_t NamesOffset; +}; + +static int +hasCompleteSectionShadows(const OffloadSectionShadowGroup *Sections) { + if (!Sections || Sections->NumShadows == 0 || Sections->NumSections % 3 != 0) + return 0; + for (int I = 0; I < Sections->NumShadows; ++I) { + if (!Sections->Shadows[I].Data || !Sections->Shadows[I].Counters || + !Sections->Shadows[I].Names) + return 0; + } + return 1; +} + +static int processDeviceOffloadPrf(void *DeviceOffloadPrf, const char *Target, + const OffloadSectionShadowGroup *Sections) { __llvm_profile_gpu_sections HostSections; if (hipMemcpy(&HostSections, DeviceOffloadPrf, sizeof(HostSections), @@ -568,6 +687,10 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin; size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin; + int UseRegisteredSections = hasCompleteSectionShadows(Sections); + RegisteredSectionRange *RegisteredRanges = nullptr; + int NumRegisteredRanges = 0; + if (isVerboseMode()) PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu " "Names=[%p,%p]=%zu\n", @@ -599,78 +722,180 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, static size_t CachedDataSize = 0; // Owns freshly malloc'd buffers; release() transfers ownership to the cache. - UniqueFree CntsOwner, DataOwner, NamesOwner; + UniqueFree CntsOwner, DataOwner, NamesOwner, RegisteredRangeOwner; + + if (UseRegisteredSections) { + NumRegisteredRanges = Sections->NumShadows; + RegisteredRangeOwner.reset( + malloc(NumRegisteredRanges * sizeof(RegisteredSectionRange))); + RegisteredRanges = (RegisteredSectionRange *)RegisteredRangeOwner.get(); + if (!RegisteredRanges) { + PROF_ERR("%s\n", "failed to allocate registered section table"); + return -1; + } + __builtin_memset(RegisteredRanges, 0, + NumRegisteredRanges * sizeof(*RegisteredRanges)); + + size_t RegisteredDataSize = 0; + size_t RegisteredCountersSize = 0; + size_t RegisteredNamesSize = 0; + for (int I = 0; I < NumRegisteredRanges; ++I) { + void *Data = nullptr; + void *Counters = nullptr; + void *Names = nullptr; + size_t ThisDataSize = 0; + size_t ThisCountersSize = 0; + size_t ThisNamesSize = 0; + OffloadSectionShadow *Shadow = &Sections->Shadows[I]; + if (getRegisteredSectionBounds(Shadow->Data, &Data, &ThisDataSize) != 0 || + getRegisteredSectionBounds(Shadow->Counters, &Counters, + &ThisCountersSize) != 0 || + getRegisteredSectionBounds(Shadow->Names, &Names, &ThisNamesSize) != + 0) { + PROF_ERR("%s\n", "failed to get registered section bounds"); + return -1; + } - if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin && - CountersSize == CachedCntsSize) { - HostCountersBegin = CachedHostCnts; - CntsReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached counters section (%zu bytes)\n", CountersSize); - } else if (CountersSize > 0) { - HostCountersBegin = (char *)malloc(CountersSize); - CntsOwner.reset(HostCountersBegin); - } + RegisteredRanges[I].Data = Data; + RegisteredRanges[I].Counters = Counters; + RegisteredRanges[I].Names = Names; + RegisteredRanges[I].DataSize = ThisDataSize; + RegisteredRanges[I].CountersSize = ThisCountersSize; + RegisteredRanges[I].NamesSize = ThisNamesSize; + RegisteredRanges[I].DataOffset = RegisteredDataSize; + RegisteredRanges[I].CountersOffset = RegisteredCountersSize; + RegisteredDataSize += ThisDataSize; + RegisteredCountersSize += ThisCountersSize; + + int ReuseNames = 0; + for (int J = 0; J < I; ++J) { + if (RegisteredRanges[J].Names == Names && + RegisteredRanges[J].NamesSize == ThisNamesSize) { + RegisteredRanges[I].NamesOffset = RegisteredRanges[J].NamesOffset; + ReuseNames = 1; + break; + } + } + if (!ReuseNames) { + RegisteredRanges[I].NamesOffset = RegisteredNamesSize; + RegisteredNamesSize += ThisNamesSize; + } + } - if (DataSize > 0 && DevDataBegin == CachedDevDataBegin && - DataSize == CachedDataSize) { - HostDataBegin = CachedHostData; - DataReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize); - } else if (DataSize > 0) { + DataSize = RegisteredDataSize; + CountersSize = RegisteredCountersSize; + NamesSize = RegisteredNamesSize; HostDataBegin = (char *)malloc(DataSize); + HostCountersBegin = (char *)malloc(CountersSize); + HostNamesBegin = NamesSize ? (char *)malloc(NamesSize) : nullptr; DataOwner.reset(HostDataBegin); - } - - if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin && - NamesSize == CachedNamesSize) { - HostNamesBegin = CachedHostNames; - NamesReused = 1; - if (isVerboseMode()) - PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize); - } else if (NamesSize > 0) { - HostNamesBegin = (char *)malloc(NamesSize); + CntsOwner.reset(HostCountersBegin); NamesOwner.reset(HostNamesBegin); - } + if ((DataSize > 0 && !HostDataBegin) || + (CountersSize > 0 && !HostCountersBegin) || + (NamesSize > 0 && !HostNamesBegin)) { + PROF_ERR("%s\n", "failed to allocate host memory for device sections"); + return -1; + } - if ((DataSize > 0 && !HostDataBegin) || - (CountersSize > 0 && !HostCountersBegin) || - (NamesSize > 0 && !HostNamesBegin)) { - PROF_ERR("%s\n", "failed to allocate host memory for device sections"); - return -1; - } + for (int I = 0; I < NumRegisteredRanges; ++I) { + RegisteredSectionRange *R = &RegisteredRanges[I]; + if (memcpyDeviceToHost(HostDataBegin + R->DataOffset, R->Data, + R->DataSize) != 0 || + memcpyDeviceToHost(HostCountersBegin + R->CountersOffset, R->Counters, + R->CountersSize) != 0) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } - if ((DataSize > 0 && !DataReused && - memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) || - (CountersSize > 0 && !CntsReused && - memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != - 0) || - (NamesSize > 0 && !NamesReused && - memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) { - PROF_ERR("%s\n", "failed to copy profile sections from device"); - return -1; - } + int CopyNames = 1; + for (int J = 0; J < I; ++J) { + if (RegisteredRanges[J].Names == R->Names && + RegisteredRanges[J].NamesSize == R->NamesSize) { + CopyNames = 0; + break; + } + } + if (CopyNames && R->NamesSize > 0 && + memcpyDeviceToHost(HostNamesBegin + R->NamesOffset, R->Names, + R->NamesSize) != 0) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } + } + } else { + if (CountersSize > 0 && DevCntsBegin == CachedDevCntsBegin && + CountersSize == CachedCntsSize) { + HostCountersBegin = CachedHostCnts; + CntsReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached counters section (%zu bytes)\n", + CountersSize); + } else if (CountersSize > 0) { + HostCountersBegin = (char *)malloc(CountersSize); + CntsOwner.reset(HostCountersBegin); + } - /* Cache buffers so RDC-mode multi-shadow drains can reuse them. - * release() prevents the scope guards from freeing what the cache owns. */ - if (!CntsReused && CountersSize > 0) { - CachedDevCntsBegin = DevCntsBegin; - CachedHostCnts = HostCountersBegin; - CachedCntsSize = CountersSize; - CntsOwner.release(); - } - if (!DataReused && DataSize > 0) { - CachedDevDataBegin = DevDataBegin; - CachedHostData = HostDataBegin; - CachedDataSize = DataSize; - DataOwner.release(); - } - if (!NamesReused && NamesSize > 0) { - CachedDevNamesBegin = DevNamesBegin; - CachedHostNames = HostNamesBegin; - CachedNamesSize = NamesSize; - NamesOwner.release(); + if (DataSize > 0 && DevDataBegin == CachedDevDataBegin && + DataSize == CachedDataSize) { + HostDataBegin = CachedHostData; + DataReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached data section (%zu bytes)\n", DataSize); + } else if (DataSize > 0) { + HostDataBegin = (char *)malloc(DataSize); + DataOwner.reset(HostDataBegin); + } + + if (NamesSize > 0 && DevNamesBegin == CachedDevNamesBegin && + NamesSize == CachedNamesSize) { + HostNamesBegin = CachedHostNames; + NamesReused = 1; + if (isVerboseMode()) + PROF_NOTE("Reusing cached names section (%zu bytes)\n", NamesSize); + } else if (NamesSize > 0) { + HostNamesBegin = (char *)malloc(NamesSize); + NamesOwner.reset(HostNamesBegin); + } + + if ((DataSize > 0 && !HostDataBegin) || + (CountersSize > 0 && !HostCountersBegin) || + (NamesSize > 0 && !HostNamesBegin)) { + PROF_ERR("%s\n", "failed to allocate host memory for device sections"); + return -1; + } + + if ((DataSize > 0 && !DataReused && + memcpyDeviceToHost(HostDataBegin, DevDataBegin, DataSize) != 0) || + (CountersSize > 0 && !CntsReused && + memcpyDeviceToHost(HostCountersBegin, DevCntsBegin, CountersSize) != + 0) || + (NamesSize > 0 && !NamesReused && + memcpyDeviceToHost(HostNamesBegin, DevNamesBegin, NamesSize) != 0)) { + PROF_ERR("%s\n", "failed to copy profile sections from device"); + return -1; + } + + /* Cache buffers so RDC-mode multi-shadow drains can reuse them. + * release() prevents the scope guards from freeing what the cache owns. */ + if (!CntsReused && CountersSize > 0) { + CachedDevCntsBegin = DevCntsBegin; + CachedHostCnts = HostCountersBegin; + CachedCntsSize = CountersSize; + CntsOwner.release(); + } + if (!DataReused && DataSize > 0) { + CachedDevDataBegin = DevDataBegin; + CachedHostData = HostDataBegin; + CachedDataSize = DataSize; + DataOwner.release(); + } + if (!NamesReused && NamesSize > 0) { + CachedDevNamesBegin = DevNamesBegin; + CachedHostNames = HostNamesBegin; + CachedNamesSize = NamesSize; + NamesOwner.release(); + } } if (isVerboseMode()) @@ -721,16 +946,38 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, for (uint64_t i = 0; i < NumData; ++i) { if (RelocatedData[i].CounterPtr) { ptrdiff_t DeviceCounterPtrOffset = (ptrdiff_t)RelocatedData[i].CounterPtr; - const char *DeviceDataStructAddr = - (const char *)DevDataBegin + (i * sizeof(__llvm_profile_data)); + size_t DataRecordOffset = i * sizeof(__llvm_profile_data); + const char *RangeDevDataBegin = (const char *)DevDataBegin; + const char *RangeDevCountersBegin = (const char *)DevCntsBegin; + size_t RangeCountersOffset = 0; + if (UseRegisteredSections) { + int FoundRange = 0; + for (int R = 0; R < NumRegisteredRanges; ++R) { + RegisteredSectionRange *Range = &RegisteredRanges[R]; + if (DataRecordOffset < Range->DataOffset || + DataRecordOffset >= Range->DataOffset + Range->DataSize) + continue; + RangeDevDataBegin = (const char *)Range->Data; + RangeDevCountersBegin = (const char *)Range->Counters; + RangeCountersOffset = Range->CountersOffset; + DataRecordOffset -= Range->DataOffset; + FoundRange = 1; + break; + } + if (!FoundRange) { + PROF_ERR("%s\n", "failed to locate profile data record range"); + return -1; + } + } + const char *DeviceDataStructAddr = RangeDevDataBegin + DataRecordOffset; const char *DeviceCountersAddr = DeviceDataStructAddr + DeviceCounterPtrOffset; ptrdiff_t OffsetIntoCountersSection = - DeviceCountersAddr - (const char *)DevCntsBegin; + DeviceCountersAddr - RangeDevCountersBegin; - ptrdiff_t NewRelativeOffset = DataSize + PaddingBytesBeforeCounters + - OffsetIntoCountersSection - - (i * sizeof(__llvm_profile_data)); + ptrdiff_t NewRelativeOffset = + DataSize + PaddingBytesBeforeCounters + RangeCountersOffset + + OffsetIntoCountersSection - (i * sizeof(__llvm_profile_data)); __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) + offsetof(__llvm_profile_data, CounterPtr), &NewRelativeOffset, sizeof(NewRelativeOffset)); @@ -743,9 +990,6 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, sizeof(RelocatedData[i].Values)); } - /* Target already encodes TUIndex when needed. */ - (void)TUIndex; - ret = __llvm_write_custom_profile( Target, (__llvm_profile_data *)BufDataBegin, (__llvm_profile_data *)(BufDataBegin + DataSize), BufCountersBegin, @@ -761,8 +1005,8 @@ static int processDeviceOffloadPrf(void *DeviceOffloadPrf, int TUIndex, return ret; } -static int processShadowVariable(void *ShadowVar, int TUIndex, - const char *Target) { +static int processShadowVariable(int Index, const char *Target) { + void *ShadowVar = OffloadShadowVariables[Index]; void *DeviceSections = nullptr; if (hipGetSymbolAddress(&DeviceSections, ShadowVar) != 0) { PROF_WARN("failed to get symbol address for shadow variable %p\n", @@ -770,7 +1014,12 @@ static int processShadowVariable(void *ShadowVar, int TUIndex, return -1; } /* DeviceSections points at the per-TU sections struct itself. */ - return processDeviceOffloadPrf(DeviceSections, TUIndex, Target); + const OffloadSectionShadowGroup *Sections = nullptr; + if (Index < CapSectionShadowGroups) + Sections = &OffloadSectionShadowGroups[Index]; + if (!hasCompleteSectionShadows(Sections)) + return 0; + return processDeviceOffloadPrf(DeviceSections, Target, Sections); } static int isHipAvailable(void) { @@ -815,7 +1064,7 @@ extern "C" int __llvm_profile_hip_collect_device_data(void) { snprintf(TargetWithIdx, sizeof(TargetWithIdx), "%s.%d", ArchName, i); Target = TargetWithIdx; } - if (processShadowVariable(OffloadShadowVariables[i], i, Target) != 0) + if (processShadowVariable(i, Target) != 0) Ret = -1; } } diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 90471e910bb50..0138e310304fa 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -152,6 +152,10 @@ inline StringRef getInstrProfVNodesVarName() { return "__llvm_prf_vnodes"; } /// of all function's PGO names. inline StringRef getInstrProfNamesVarName() { return "__llvm_prf_nm"; } +inline StringRef getInstrProfNamesVarPostfixVarName() { + return "__llvm_prf_nm_postfix"; +} + inline StringRef getInstrProfVTableNamesVarName() { return "__llvm_prf_vnm"; } /// Return the name of a covarage mapping variable (internal linkage) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 8e4ba41919768..6f6ad89126024 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1984,6 +1984,50 @@ void InstrLowerer::emitVNodes() { UsedVars.push_back(VNodesVar); } +// Build the per-TU device-PGO sections struct: section start/stop bounds for +// names/counters/data plus the raw version. Returns null if it already exists. +static GlobalVariable *emitGPUOffloadSectionsStruct(Module &M, + StringRef CUIDPostfix) { + std::string Name = ("__llvm_profile_sections" + CUIDPostfix).str(); + if (M.getNamedValue(Name)) + return nullptr; + + LLVMContext &Ctx = M.getContext(); + unsigned AS = M.getDataLayout().getDefaultGlobalsAddressSpace(); + auto Extern = [&](StringRef Sym, Type *Ty, bool IsConst, + GlobalValue::VisibilityTypes Vis) { + GlobalVariable *GV = M.getNamedGlobal(Sym); + if (!GV) { + GV = new GlobalVariable(M, Ty, IsConst, GlobalValue::ExternalLinkage, + nullptr, Sym, nullptr, + GlobalValue::NotThreadLocal, AS); + GV->setVisibility(Vis); + } + return GV; + }; + // Section bounds are hidden i8 markers; raw_version is an i64 constant. + auto *I8 = Type::getInt8Ty(Ctx); + auto Hidden = GlobalValue::HiddenVisibility; + Constant *Fields[] = {Extern("__start___llvm_prf_names", I8, false, Hidden), + Extern("__stop___llvm_prf_names", I8, false, Hidden), + Extern("__start___llvm_prf_cnts", I8, false, Hidden), + Extern("__stop___llvm_prf_cnts", I8, false, Hidden), + Extern("__start___llvm_prf_data", I8, false, Hidden), + Extern("__stop___llvm_prf_data", I8, false, Hidden), + Extern("__llvm_profile_raw_version", + Type::getInt64Ty(Ctx), true, + GlobalValue::DefaultVisibility)}; + auto *PtrTy = PointerType::get(Ctx, AS); + auto *STy = + StructType::get(Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); + auto *GV = new GlobalVariable(M, STy, /*isConstant=*/true, + GlobalValue::ExternalLinkage, + ConstantStruct::get(STy, Fields), Name, nullptr, + GlobalValue::NotThreadLocal, AS); + GV->setVisibility(GlobalValue::ProtectedVisibility); + return GV; +} + void InstrLowerer::emitNameData() { if (ReferencedNames.empty()) return; @@ -1998,9 +2042,28 @@ void InstrLowerer::emitNameData() { auto *NamesVal = ConstantDataArray::getString(Ctx, StringRef(CompressedNameStr), false); std::string NamesVarName = std::string(getInstrProfNamesVarName()); - NamesVar = - new GlobalVariable(M, NamesVal->getType(), true, - GlobalValue::PrivateLinkage, NamesVal, NamesVarName); + GlobalValue::LinkageTypes NamesLinkage = GlobalValue::PrivateLinkage; + GlobalValue::VisibilityTypes NamesVisibility = GlobalValue::DefaultVisibility; + std::string GPUCUIDPostfix; + if (isGPUProfTarget(M)) { + if (auto *GV = M.getNamedGlobal(getInstrProfNamesVarPostfixVarName())) { + if (auto *Init = + dyn_cast_or_null(GV->getInitializer())) { + if (Init->isCString()) { + GPUCUIDPostfix = Init->getAsCString().str(); + NamesVarName += GPUCUIDPostfix; + NamesLinkage = GlobalValue::ExternalLinkage; + NamesVisibility = GlobalValue::ProtectedVisibility; + removeFromUsedLists( + M, [GV](Constant *C) { return C->stripPointerCasts() == GV; }); + GV->eraseFromParent(); + } + } + } + } + NamesVar = new GlobalVariable(M, NamesVal->getType(), true, NamesLinkage, + NamesVal, NamesVarName); + NamesVar->setVisibility(NamesVisibility); NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); @@ -2019,6 +2082,14 @@ void InstrLowerer::emitNameData() { for (auto *NamePtr : ReferencedNames) NamePtr->eraseFromParent(); + + // Emit the device sections struct only when this TU produced profile data, so + // its section start/stop references are backed by a real section. + bool HasData = llvm::any_of(ProfileDataMap, + [](const auto &KV) { return KV.second.DataVar; }); + if (!GPUCUIDPostfix.empty() && HasData) + if (GlobalVariable *GV = emitGPUOffloadSectionsStruct(M, GPUCUIDPostfix)) + CompilerUsedVars.push_back(GV); } void InstrLowerer::emitVTableNames() {