diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 46d6bb5bd8896..7525feab8f133 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. */ +/* INSTR_PROF_GPU_SECT start. */ +/* Fields of the GPU profile section bounds structure, populated by the + * compiler runtime and read by the host to extract profiling data. */ +#ifndef INSTR_PROF_GPU_SECT +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \ + VersionVar, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +#undef INSTR_PROF_GPU_SECT +/* INSTR_PROF_GPU_SECT end. */ + /* INSTR_PROF_RAW_HEADER start */ /* Definition of member fields of the raw profile header data structure. 
*/ /* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating @@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * specified via command line. */ #define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename +/* GPU profiling section bounds structure, populated by the compiler runtime + * and read by the host to extract profiling data. */ +#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections + /* section name strings common to all targets other than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 54013d7e6568d..1d22934bd6ef1 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -57,6 +57,11 @@ typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { #include "profile/InstrProfData.inc" } VTableProfData; +typedef struct __llvm_profile_gpu_sections { +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} __llvm_profile_gpu_sections; + typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) __llvm_gcov_init_func_struct { #define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c index 78bf512f8c44f..ab7031343c855 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c @@ -17,6 +17,9 @@ #include "InstrProfiling.h" #include +// Symbols exported to the GPU runtime need to be visible in the .dynsym table. +#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected"))) + // Indicates that the current wave is fully occupied. 
static int is_uniform(uint64_t mask) { const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes()); @@ -39,4 +42,45 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter, } } +#if defined(__AMDGPU__) + +#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) +#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) +#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) +#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON) +#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) + +extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; +extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; + +// AMDGPU is a proper ELF target and exports the linker-defined section bounds. +COMPILER_RT_GPU_VISIBILITY +__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { + PROF_NAME_START, + PROF_NAME_STOP, + PROF_CNTS_START, + PROF_CNTS_STOP, + PROF_DATA_START, + PROF_DATA_STOP, + &INSTR_PROF_RAW_VERSION_VAR}; + +#elif defined(__NVPTX__) + +// NVPTX supports neither sections nor ELF symbols; we rely on the handling in +// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time. +// FIXME: This will not work until we make the NVPTX backend emit section +// globals next to each other. 
+COMPILER_RT_GPU_VISIBILITY +__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { + NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR}; + +#endif + #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index acdb222004fd4..7a22be6bb5861 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -23,7 +23,7 @@ #if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \ defined(_AIX) || defined(__wasm__) || defined(__HAIKU__) || \ - defined(COMPILER_RT_PROFILE_BAREMETAL) + (defined(COMPILER_RT_PROFILE_BAREMETAL) && !defined(__NVPTX__)) #if !defined(_AIX) && !defined(__wasm__) && \ !defined(COMPILER_RT_PROFILE_BAREMETAL) diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c index f5d1c74f10115..205bba1060c3b 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c @@ -13,28 +13,38 @@ // This implementation expects the compiler instrumentation pass to define a // constructor in each file which calls into this file. 
-#if !defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \ - !defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \ - !defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \ - !defined(__wasm__) && !defined(__HAIKU__) && \ - !defined(COMPILER_RT_PROFILE_BAREMETAL) - -#include -#include +#if (!defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \ + !defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \ + !defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \ + !defined(__wasm__) && !defined(__HAIKU__) && \ + !defined(COMPILER_RT_PROFILE_BAREMETAL)) || \ + defined(__NVPTX__) #include "InstrProfiling.h" #include "InstrProfilingInternal.h" +#if defined(__NVPTX__) +extern __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE; +#define DataFirst INSTR_PROF_SECT_BOUNDS_TABLE.DataStart +#define DataLast INSTR_PROF_SECT_BOUNDS_TABLE.DataStop +#define NamesFirst INSTR_PROF_SECT_BOUNDS_TABLE.NamesStart +#define NamesLast INSTR_PROF_SECT_BOUNDS_TABLE.NamesStop +#define CountersFirst INSTR_PROF_SECT_BOUNDS_TABLE.CountersStart +#define CountersLast INSTR_PROF_SECT_BOUNDS_TABLE.CountersStop +#else static const __llvm_profile_data *DataFirst = NULL; static const __llvm_profile_data *DataLast = NULL; -static const VTableProfData *VTableProfDataFirst = NULL; -static const VTableProfData *VTableProfDataLast = NULL; static const char *NamesFirst = NULL; static const char *NamesLast = NULL; -static const char *VNamesFirst = NULL; -static const char *VNamesLast = NULL; static char *CountersFirst = NULL; static char *CountersLast = NULL; +#endif +static const VTableProfData *VTableProfDataFirst = NULL; +static const VTableProfData *VTableProfDataLast = NULL; +static const char *VNamesFirst = NULL; +static const char *VNamesLast = NULL; +static char *BitmapFirst = NULL; +static char *BitmapLast = NULL; static const void *getMinAddr(const void *A1, const void *A2) { return A1 < A2 ? 
A1 : A2; @@ -55,6 +65,19 @@ COMPILER_RT_VISIBILITY void __llvm_profile_register_function(void *Data_) { /* TODO: Only emit this function if we can't use linker magic. */ const __llvm_profile_data *Data = (__llvm_profile_data *)Data_; + +#if defined(__NVPTX__) + // NVPTX stores absolute counter addresses to avoid circular dependencies in + // PTX global variable initializers. Convert to a relative offset so the + // host-side profile reader sees the standard format. + { + uintptr_t Rel = (uintptr_t)Data->CounterPtr - (uintptr_t)Data_; + __builtin_memcpy((char *)Data_ + + __builtin_offsetof(__llvm_profile_data, CounterPtr), + &Rel, sizeof(Rel)); + } +#endif + if (!DataFirst) { DataFirst = Data; DataLast = Data + 1; diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 46d6bb5bd8896..7525feab8f133 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. */ +/* INSTR_PROF_GPU_SECT start. */ +/* Fields of the GPU profile section bounds structure, populated by the + * compiler runtime and read by the host to extract profiling data. 
*/ +#ifndef INSTR_PROF_GPU_SECT +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \ + VersionVar, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +#undef INSTR_PROF_GPU_SECT +/* INSTR_PROF_GPU_SECT end. */ + /* INSTR_PROF_RAW_HEADER start */ /* Definition of member fields of the raw profile header data structure. */ /* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating @@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * specified via command line. */ #define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename +/* GPU profiling section bounds structure, populated by the compiler runtime + * and read by the host to extract profiling data. 
*/ +#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections + /* section name strings common to all targets other than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 82469481881c0..b96db851fa6bd 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -486,25 +486,18 @@ bool isGPUProfTarget(const Module &M) { } void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) { - // If the target is a GPU, make the symbol protected so it can - // be read from the host device - if (isGPUProfTarget(M)) - FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); // Hide the symbol so that we correctly get a copy for each executable. - else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); } GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { - // Ensure profiling variables on GPU are visible to be read from host - if (isGPUProfTarget(M)) - Linkage = GlobalValue::ExternalLinkage; // We generally want to match the function's linkage, but available_externally // and extern_weak both have the wrong semantics, and anything that doesn't // need to link across compilation units doesn't need to be visible at all. 
- else if (Linkage == GlobalValue::ExternalWeakLinkage) + if (Linkage == GlobalValue::ExternalWeakLinkage) Linkage = GlobalValue::LinkOnceAnyLinkage; else if (Linkage == GlobalValue::AvailableExternallyLinkage) Linkage = GlobalValue::LinkOnceODRLinkage; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d1696f4afbe36..2ae6c68313508 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1425,6 +1425,10 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { } static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { + // NVPTX is an ELF target but PTX does not expose sections or linker symbols. + if (TT.isNVPTX()) + return true; + // compiler-rt uses linker support to get data/counters/name start/end for // ELF, COFF, Mach-O, XCOFF, and Wasm. if (TT.isOSBinFormatELF() || TT.isOSBinFormatCOFF() || @@ -1815,10 +1819,6 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); - if (isGPUProfTarget(M)) { - Linkage = GlobalValue::ExternalLinkage; - Visibility = GlobalValue::ProtectedVisibility; - } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1830,9 +1830,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. 
- else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && - (TT.isOSBinFormatELF() || - (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { + if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + (TT.isOSBinFormatELF() || + (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } @@ -1849,6 +1849,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy); if (BitmapPtr != nullptr) RelativeBitmapPtr = ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy); + } else if (TT.isNVPTX()) { + // The NVPTX target cannot handle self-referencing constant expressions in + // global initializers at all. Use absolute pointers and have the runtime + // registration convert them to relative offsets. + DataSectionKind = IPSK_data; + RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy); } else { // Reference the counter variable with a label difference (link-time // constant). @@ -1953,10 +1959,6 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); - if (isGPUProfTarget(M)) { - NamesVar->setLinkage(GlobalValue::ExternalLinkage); - NamesVar->setVisibility(GlobalValue::ProtectedVisibility); - } NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); @@ -2048,6 +2050,11 @@ void InstrLowerer::emitRegistration() { } bool InstrLowerer::emitRuntimeHook() { + // GPU profiling data is read directly by the host offload runtime. We do not + // need the standard runtime hook. + if (TT.isGPU()) + return false; + // We expect the linker to be invoked with -u flag for Linux // in which case there is no need to emit the external variable. 
if (TT.isOSLinux() || TT.isOSAIX()) @@ -2062,10 +2069,7 @@ bool InstrLowerer::emitRuntimeHook() { auto *Var = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - if (isGPUProfTarget(M)) - Var->setVisibility(GlobalValue::ProtectedVisibility); - else - Var->setVisibility(GlobalValue::HiddenVisibility); + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 0232d45e5b7bb..db032d6fcad45 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -469,9 +469,6 @@ createIRLevelProfileFlagVar(Module &M, M, IntTy64, true, GlobalValue::WeakAnyLinkage, Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName); IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility); - if (isGPUProfTarget(M)) - IRLevelVersionVariable->setVisibility( - llvm::GlobalValue::ProtectedVisibility); Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index af7dac66ca85d..529d697d355b2 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -65,6 +65,12 @@ struct __llvm_profile_data { #include "llvm/ProfileData/InstrProfData.inc" }; +struct __llvm_profile_gpu_sections { +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) \ + std::remove_const::type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + extern "C" { extern int __attribute__((weak)) __llvm_write_custom_profile( const char *Target, const __llvm_profile_data *DataBegin, @@ -72,11 +78,11 @@ extern int 
__attribute__((weak)) __llvm_write_custom_profile( const char *CountersEnd, const char *NamesBegin, const char *NamesEnd, const uint64_t *VersionOverride); } -/// PGO profiling data extracted from a GPU device +/// PGO profiling data extracted from a GPU device via __llvm_profile_sections. struct GPUProfGlobals { - SmallVector Counts; - SmallVector<__llvm_profile_data> Data; - SmallVector NamesData; + SmallVector NamesSection; + SmallVector CountersSection; + SmallVector DataSection; Triple TargetTriple; uint64_t Version = INSTR_PROF_RAW_VERSION; diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index b92c606d14da1..a5edc663062a1 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -16,6 +16,7 @@ #include "Shared/Utils.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Error.h" @@ -179,67 +180,63 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { - GPUProfGlobals DeviceProfileData; + const char *TableName = INSTR_PROF_QUOTE(INSTR_PROF_SECT_BOUNDS_TABLE); + if (!isSymbolInImage(Device, Image, TableName)) + return GPUProfGlobals{}; + + GPUProfGlobals ProfData; auto ObjFile = getELFObjectFile(Image); if (!ObjFile) return ObjFile.takeError(); std::unique_ptr ELFObj( static_cast(ObjFile->release())); - DeviceProfileData.TargetTriple = ELFObj->makeTriple(); - - // Iterate through elf symbols - for (auto &Sym : ELFObj->symbols()) { - auto NameOrErr = Sym.getName(); - if (!NameOrErr) - return NameOrErr.takeError(); - - // Check if given current global is a profiling global based - // on name - if (*NameOrErr == getInstrProfNamesVarName()) { - // Read in profiled function names from ELF - auto SectionOrErr = Sym.getSection(); - if 
(!SectionOrErr) - return SectionOrErr.takeError(); - - auto ContentsOrErr = (*SectionOrErr)->getContents(); - if (!ContentsOrErr) - return ContentsOrErr.takeError(); - - SmallVector NameBytes(ContentsOrErr->bytes()); - DeviceProfileData.NamesData = NameBytes; - } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) { - // Read global variable profiling counts - SmallVector Counts(Sym.getSize() / sizeof(int64_t), 0); - GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); - if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) - return Err; - DeviceProfileData.Counts.append(std::move(Counts)); - } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { - // Read profiling data for this global variable - __llvm_profile_data Data{}; - GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data); - if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) - return Err; - DeviceProfileData.Data.push_back(std::move(Data)); - } else if (*NameOrErr == INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)) { - uint64_t RawVersionData; - GlobalTy RawVersionGlobal(NameOrErr->str(), Sym.getSize(), - &RawVersionData); - if (auto Err = readGlobalFromDevice(Device, Image, RawVersionGlobal)) - return Err; - DeviceProfileData.Version = RawVersionData; - } - } - return DeviceProfileData; + ProfData.TargetTriple = ELFObj->makeTriple(); + + __llvm_profile_gpu_sections Table = {}; + GlobalTy TableGlobal(TableName, sizeof(Table), &Table); + if (auto Err = readGlobalFromDevice(Device, Image, TableGlobal)) + return Err; + + // Read the contiguous data from one of the profiling sections on the device. + auto ReadSection = [&](const void *Start, const void *Stop, + SmallVector &Out) -> Error { + uintptr_t Begin = reinterpret_cast(Start); + uintptr_t End = reinterpret_cast(Stop); + size_t Size = End - Begin; + Out.resize_for_overwrite(Size); + return Size ? 
Device.dataRetrieve(Out.data(), Start, Size, + /*AsyncInfo=*/nullptr) + : Error::success(); + }; + + if (auto Err = + ReadSection(Table.NamesStart, Table.NamesStop, ProfData.NamesSection)) + return Err; + if (auto Err = ReadSection(Table.CountersStart, Table.CountersStop, + ProfData.CountersSection)) + return Err; + if (auto Err = + ReadSection(Table.DataStart, Table.DataStop, ProfData.DataSection)) + return Err; + + // Get the profiling version from the device. + if (auto Err = Device.dataRetrieve(&ProfData.Version, Table.VersionVar, + sizeof(uint64_t), + /*AsyncInfo=*/nullptr)) + return Err; + + return ProfData; } void GPUProfGlobals::dump() const { outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() << "\n"; - outs() << "======== Counters =========\n"; - for (size_t i = 0; i < Counts.size(); i++) { + size_t NumCounters = CountersSection.size() / sizeof(int64_t); + outs() << "======== Counters (" << NumCounters << ") =========\n"; + auto *Counts = reinterpret_cast(CountersSection.data()); + for (size_t i = 0; i < NumCounters; i++) { if (i > 0 && i % 10 == 0) outs() << "\n"; else if (i != 0) @@ -248,33 +245,14 @@ void GPUProfGlobals::dump() const { } outs() << "\n"; - outs() << "========== Data ===========\n"; - for (const auto &ProfData : Data) { - outs() << "{ "; -// The ProfData.Name maybe array, eg: NumValueSites[IPVK_Last+1] . -// If we print out it directly, we are accessing out of bound data. -// Skip dumping the array for now. -#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ - if (sizeof(#Name) > 2 && #Name[sizeof(#Name) - 2] == ']') { \ - outs() << "[...] 
"; \ - } else { \ - outs() << ProfData.Name << " "; \ - } -#include "llvm/ProfileData/InstrProfData.inc" - outs() << "}\n"; - } + size_t NumDataEntries = DataSection.size() / sizeof(__llvm_profile_data); + outs() << "========== Data (" << NumDataEntries << ") ===========\n"; outs() << "======== Functions ========\n"; - std::string s; - s.reserve(NamesData.size()); - for (uint8_t Name : NamesData) { - s.push_back((char)Name); - } - InstrProfSymtab Symtab; - if (Error Err = Symtab.create(StringRef(s))) { + if (Error Err = + Symtab.create(StringRef(NamesSection.data(), NamesSection.size()))) consumeError(std::move(Err)); - } Symtab.dumpNames(outs()); outs() << "===========================\n"; } @@ -286,35 +264,27 @@ Error GPUProfGlobals::write() const { "The compiler-rt profiling library must be linked for " "GPU PGO to work."); - size_t DataSize = Data.size() * sizeof(__llvm_profile_data), - CountsSize = Counts.size() * sizeof(int64_t); - __llvm_profile_data *DataBegin, *DataEnd; - char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd; - - // Initialize array of contiguous data. 
We need to make sure each section is - // contiguous so that the PGO library can compute deltas properly - SmallVector ContiguousData(NamesData.size() + DataSize + CountsSize); - - // Compute region pointers - DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize); - DataEnd = - (__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize); - CountersBegin = (char *)ContiguousData.data(); - CountersEnd = (char *)(ContiguousData.data() + CountsSize); - NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize); - NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize + - NamesData.size()); - - // Copy data to contiguous buffer - memcpy(DataBegin, Data.data(), DataSize); - memcpy(CountersBegin, Counts.data(), CountsSize); - memcpy(NamesBegin, NamesData.data(), NamesData.size()); - - // Invoke compiler-rt entrypoint - int result = __llvm_write_custom_profile( - TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin, - CountersEnd, NamesBegin, NamesEnd, &Version); - if (result != 0) + // The sections must be laid out contiguously so that lprofWriteDataImpl + // computes the correct CountersDelta from the pointer arithmetic. + // TODO: Move this interface to compiler-rt. 
+ SmallVector Buffer(CountersSection.size() + DataSection.size() + + NamesSection.size()); + char *CountersBegin = Buffer.data(); + char *DataBegin = CountersBegin + CountersSection.size(); + char *NamesBegin = DataBegin + DataSection.size(); + + memcpy(CountersBegin, CountersSection.data(), CountersSection.size()); + memcpy(DataBegin, DataSection.data(), DataSection.size()); + memcpy(NamesBegin, NamesSection.data(), NamesSection.size()); + + int Result = __llvm_write_custom_profile( + TargetTriple.str().c_str(), + reinterpret_cast(DataBegin), + reinterpret_cast(DataBegin + + DataSection.size()), + CountersBegin, CountersBegin + CountersSection.size(), NamesBegin, + NamesBegin + NamesSection.size(), &Version); + if (Result != 0) return Plugin::error(ErrorCode::HOST_IO, "error writing GPU PGO data to file"); @@ -322,5 +292,5 @@ Error GPUProfGlobals::write() const { } bool GPUProfGlobals::empty() const { - return Counts.empty() && Data.empty() && NamesData.empty(); + return CountersSection.empty() && DataSection.empty() && NamesSection.empty(); }