diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 512577c06f9eb..6dd935e1128ad 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -16,7 +16,7 @@
 #include "Utils.h"
 
 #pragma omp begin declare target device_type(nohost)
-
+extern const uint16_t __oclc_ABI_version;
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace _OMP;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
index 69f2a716a8fd6..f4a4ceaa92a81 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
@@ -11,6 +11,7 @@
 // identifier) and contains more up to date values for the enum checked here.
 // rtl.cpp uses the system elf.h.
 #include "llvm/BinaryFormat/ELF.h"
+using namespace llvm::ELF;
 
 const char *get_elf_mach_gfx_name(uint32_t EFlags) {
   using namespace llvm::ELF;
@@ -78,3 +79,10 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags) {
     return "--unknown gfx";
   }
 }
+
+// Returns the size of the implicit kernel argument block for an ELF ABI
+// version: COV4_SIZE below ELFABIVERSION_AMDGPU_HSA_V5, COV5_SIZE otherwise.
+uint16_t implicitArgsSize(uint16_t Version) {
+  return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
+                                               : IMPLICITARGS::COV5_SIZE;
+}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
index 177963e1b8b5c..a5404bd3d7934 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
@@ -12,4 +12,52 @@
 
 const char *get_elf_mach_gfx_name(uint32_t EFlags);
 
+// Sizes and byte offsets of the implicit kernel arguments for code object
+// versions 4 (COV4_*) and 5 (COV5_*).
+enum IMPLICITARGS : uint16_t {
+  COV4_SIZE = 56,
+  COV4_HOSTCALL_PTR_OFFSET = 24,
+  HOSTCALL_PTR_SIZE = 8,
+
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Y_OFFSET = 4,
+  COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Z_OFFSET = 8,
+  COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_REMAINDER_X_OFFSET = 18,
+  COV5_REMAINDER_X_SIZE = 2,
+
+  COV5_REMAINDER_Y_OFFSET = 20,
+  COV5_REMAINDER_Y_SIZE = 2,
+
+  COV5_REMAINDER_Z_OFFSET = 22,
+  COV5_REMAINDER_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HOSTCALL_PTR_OFFSET = 80,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8
+};
+
+// Size of the implicit kernel argument block for the given ELF ABI version.
+uint16_t implicitArgsSize(uint16_t Version);
+
 #endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index 63b60b24a5570..dc94b0ed01f20 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -33,17 +33,6 @@
 
 #define MAX_NUM_KERNELS (1024 * 16)
 
-typedef struct impl_implicit_args_s {
-  uint64_t offset_x;
-  uint64_t offset_y;
-  uint64_t offset_z;
-  uint64_t hostcall_ptr;
-  uint64_t unused0;
-  uint64_t unused1;
-  uint64_t unused2;
-} impl_implicit_args_t;
-static_assert(sizeof(impl_implicit_args_t) == 56, "");
-
 // ---------------------- Kernel Start -------------
 typedef struct atl_kernel_info_s {
   uint64_t kernel_object;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index e8dba47b6cde0..0170cd4440f54 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -67,6 +67,17 @@ class KernelArgMD {
     HiddenMultiGridSyncArg,
     HiddenHostcallBuffer,
     HiddenHeapV1,
+    HiddenBlockCountX,
+    HiddenBlockCountY,
+    HiddenBlockCountZ,
+    HiddenGroupSizeX,
+    HiddenGroupSizeY,
+    HiddenGroupSizeZ,
+    HiddenRemainderX,
+    HiddenRemainderY,
+    HiddenRemainderZ,
+    HiddenGridDims,
+    HiddenQueuePtr,
     Unknown
   };
 
@@ -102,7 +113,19 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
     {"hidden_multigrid_sync_arg",
      KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
     {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
-    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
+    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1},
+    {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX},
+    {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY},
+    {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ},
+    {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX},
+    {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY},
+    {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ},
+    {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX},
+    {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY},
+    {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ},
+    {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims},
+    {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr},
+};
 
 namespace core {
 
@@ -164,6 +187,17 @@ static bool isImplicit(KernelArgMD::ValueKind value_kind) {
   case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
   case KernelArgMD::ValueKind::HiddenHostcallBuffer:
   case KernelArgMD::ValueKind::HiddenHeapV1:
+  case KernelArgMD::ValueKind::HiddenBlockCountX:
+  case KernelArgMD::ValueKind::HiddenBlockCountY:
+  case KernelArgMD::ValueKind::HiddenBlockCountZ:
+  case KernelArgMD::ValueKind::HiddenGroupSizeX:
+  case KernelArgMD::ValueKind::HiddenGroupSizeY:
+  case KernelArgMD::ValueKind::HiddenGroupSizeZ:
+  case KernelArgMD::ValueKind::HiddenRemainderX:
+  case KernelArgMD::ValueKind::HiddenRemainderY:
+  case KernelArgMD::ValueKind::HiddenRemainderZ:
+  case KernelArgMD::ValueKind::HiddenGridDims:
+  case KernelArgMD::ValueKind::HiddenQueuePtr:
     return true;
   default:
     return false;
@@ -473,8 +507,7 @@ static hsa_status_t get_code_object_custom_metadata(
           size_t new_offset = lcArg.offset_;
           size_t padding = new_offset - offset;
           offset = new_offset;
-          DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
-             lcArg.offset_);
+
           offset += lcArg.size_;
 
           // check if the arg is a hidden/implicit arg
@@ -482,9 +515,13 @@ static hsa_status_t get_code_object_custom_metadata(
           if (!isImplicit(lcArg.valueKind_)) {
             info.explicit_argument_count++;
             kernel_explicit_args_size += lcArg.size_;
+            DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+               lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
           } else {
             info.implicit_argument_count++;
             hasHiddenArgs = true;
+            DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+               lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
           }
           kernel_explicit_args_size += padding;
         }
@@ -492,7 +529,7 @@ static hsa_status_t get_code_object_custom_metadata(
 
       // TODO: Probably don't want this arithmetic
       info.kernel_segment_size =
-          (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
+          (!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
       DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
          kernel_segment_size, info.kernel_segment_size);
 
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index b0e29cb6e4e96..38879c8e6eb87 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -124,9 +124,10 @@ struct KernelArgPool {
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + sizeof(impl_implicit_args_t);
+    return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
   }
 
   ~KernelArgPool() {
@@ -143,8 +144,10 @@ struct KernelArgPool {
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+                uint16_t CodeObjectVersion)
+      : KernargSegmentSize(KernargSegmentSize),
+        CodeObjectVersion(CodeObjectVersion) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue
@@ -228,16 +231,16 @@ struct KernelTy {
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
+      KernelArgPoolMap.insert(std::make_pair(
+          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
     }
   }
 };
@@ -474,6 +477,7 @@ class RTLDeviceInfoTy : HSALifetime {
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -487,6 +491,7 @@ class RTLDeviceInfoTy : HSALifetime {
 
   // Resource pools
   SignalPoolT FreeSignalPool;
+  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -861,7 +866,6 @@ class RTLDeviceInfoTy : HSALifetime {
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1032,6 +1036,7 @@ class RTLDeviceInfoTy : HSALifetime {
     SymbolInfoTable.resize(NumberOfDevices);
     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+    PreallocatedDeviceHeap.resize(NumberOfDevices);
 
     Err = setupDevicePools(HSAAgents);
     if (Err != HSA_STATUS_SUCCESS) {
@@ -1361,6 +1366,30 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
   return PacketId;
 }
 
+// Reads the ABI version byte (e_ident[EI_ABIVERSION]) from the ELF header of
+// Image. Returns 1 if the image cannot be loaded as ELF and 0 if the loaded
+// object is not of the ELF class this function handles.
+uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+    auto Header = ELFObj->getELFFile().getHeader();
+    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+    DP("ELFABIVERSION Version: %u\n", Version);
+    return Version;
+  }
+  return 0;
+}
+
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1438,6 +1467,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
     }
     uint64_t PacketId = acquireAvailablePacketId(Queue);
+    uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
 
     const uint32_t Mask = Queue->size - 1; // size is a power of 2
     hsa_kernel_dispatch_packet_t *Packet =
         (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2160,6 +2190,47 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
   return Res;
 }
 
+// Allocates a zero-filled 128KB block from each device's coarse-grained pool
+// and records it in PreallocatedDeviceHeap. HSA errors are only logged.
+static void preAllocateHeapMemoryForCov5() {
+  constexpr size_t PreAllocSize = 131072; // 128KB per device
+  for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
+    // Called for every COV5 image that is loaded; keep a heap the device
+    // already has instead of leaking it by allocating a replacement.
+    if (DeviceInfo().PreallocatedDeviceHeap[I])
+      continue;
+
+    void *DevPtr = nullptr;
+    hsa_amd_memory_pool_t MemoryPool =
+        DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
+    hsa_status_t Err =
+        hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error allocating preallocated heap device memory: %s\n",
+         get_error_string(Err));
+      // Nothing to share or clear; leave this device's slot untouched.
+      continue;
+    }
+
+    Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], nullptr,
+                                      DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
+         get_error_string(Err));
+    }
+
+    uint64_t Rounded =
+        sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
+    Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error zero-initializing preallocated heap device memory:%s\n",
+         get_error_string(Err));
+    }
+
+    DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
+  }
+}
+
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
                                                  __tgt_device_image *Image) {
   // This function loads the device image onto gpu[DeviceId] and does other
@@ -2194,6 +2265,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
+  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+  if (DeviceInfo().CodeObjectVersion >=
+      llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+    preAllocateHeapMemoryForCov5();
+  }
+
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2517,7 +2594,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
       KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                      CallStackAddr, E->name, KernargSegmentSize,
-                                     DeviceInfo().KernArgPool));
+                                     DeviceInfo().KernArgPool,
+                                     DeviceInfo().CodeObjectVersion));
       __tgt_offload_entry Entry = *E;
       Entry.addr = (void *)&KernelsList.back();
       DeviceInfo().addOffloadEntry(DeviceId, Entry);
diff --git a/openmp/runtime/cmake/LibompHandleFlags.cmake b/openmp/runtime/cmake/LibompHandleFlags.cmake
index a6adbe3f2f540..684eae9f0b258 100644
--- a/openmp/runtime/cmake/LibompHandleFlags.cmake
+++ b/openmp/runtime/cmake/LibompHandleFlags.cmake
@@ -100,7 +100,7 @@ function(libomp_get_ldflags ldflags)
   libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
     IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
   libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
-  libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
   libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
   libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
   libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake
index 1e02d5a8b5cf1..d1346121edf53 100644
--- a/openmp/runtime/cmake/config-ix.cmake
+++ b/openmp/runtime/cmake/config-ix.cmake
@@ -131,7 +131,7 @@ if(WIN32)
 elseif(NOT APPLE)
   libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
   libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
-  libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
   libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
   libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
   libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)