diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp index a9aabbf39e0e7..e3f9f71a3068a 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -42,13 +42,13 @@ static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, return err; } -struct atmiFreePtrDeletor { +struct implFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free } }; -hsa_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, +hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, const void *hostSrc, size_t size, hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool) { @@ -68,7 +68,7 @@ hsa_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, size); return ret; } - std::unique_ptr del(tempHostPtr); + std::unique_ptr del(tempHostPtr); memcpy(tempHostPtr, hostSrc, size); if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) != @@ -78,7 +78,7 @@ hsa_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, return HSA_STATUS_SUCCESS; } -hsa_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest, +hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest, const void *deviceSrc, size_t size, hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool) { @@ -98,7 +98,7 @@ hsa_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest, size); return ret; } - std::unique_ptr del(tempHostPtr); + std::unique_ptr del(tempHostPtr); if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) != HSA_STATUS_SUCCESS) { diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.h b/openmp/libomptarget/plugins/amdgpu/impl/impl.h deleted file mode 100644 index 81c32a9487c60..0000000000000 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.h +++ /dev/null @@ -1,39 +0,0 @@ -//===--- amdgpu/impl/impl.h --------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef INCLUDE_IMPL_H_ -#define INCLUDE_IMPL_H_ - -#define ROCM_VERSION_MAJOR 3 -#define ROCM_VERSION_MINOR 2 - -/** \defgroup enumerations Enumerated Types - * @{ - */ - -/** - * @brief Device Types. - */ -typedef enum atmi_devtype_s { - ATMI_DEVTYPE_CPU = 0x0001, - ATMI_DEVTYPE_iGPU = 0x0010, // Integrated GPU - ATMI_DEVTYPE_dGPU = 0x0100, // Discrete GPU - ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU - ATMI_DEVTYPE_ALL = 0x111 // Union of all device types -} atmi_devtype_t; - -/** - * @brief Memory Access Type. - */ -typedef enum atmi_memtype_s { - ATMI_MEMTYPE_FINE_GRAINED = 0, - ATMI_MEMTYPE_COARSE_GRAINED = 1, - ATMI_MEMTYPE_ANY -} atmi_memtype_t; - -/** @} */ -#endif // INCLUDE_IMPL_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h index 8d85458cc375c..52efaffb515f0 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -8,63 +8,25 @@ #ifndef INCLUDE_IMPL_RUNTIME_H_ #define INCLUDE_IMPL_RUNTIME_H_ -#include "impl.h" #include "hsa_api.h" extern "C" { -/** \defgroup module_functions ATMI Module - * @{ - */ - -/** - * @brief Register the ATMI code module from memory on to a specific place - * (device). - * - * @detail Currently, only GPU devices need explicit module registration because - * of their specific ISAs that require a separate compilation phase. On the - * other - * hand, CPU devices execute regular x86 functions that are compiled with the - * host program. - * - * @param[in] module_bytes A memory region that contains the GPU modules - * targeting ::AMDGCN platform types. Value cannot be NULL. - * - * @param[in] module_size Size of module region - * - * @param[in] place Denotes the execution place (device) on which the module - * should be registered and loaded. - * - * @param[in] on_deserialized_data Callback run on deserialized code object, - * before loading it - * - * @param[in] cb_state void* passed to on_deserialized_data callback - * - * @retval ::HSA_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::HSA_STATUS_ERROR The function encountered errors. - * - */ -hsa_status_t atmi_module_register_from_memory_to_place( +hsa_status_t impl_module_register_from_memory_to_place( void *module_bytes, size_t module_size, int DeviceId, hsa_status_t (*on_deserialized_data)(void *data, size_t size, void *cb_state), void *cb_state); -/** @} */ - -hsa_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest, +hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, const void *hostSrc, size_t size, hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool); -hsa_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *hostDest, +hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, const void *deviceSrc, size_t size, hsa_agent_t agent, hsa_amd_memory_pool_t MemoryPool); - -/** @} */ - } #endif // INCLUDE_IMPL_RUNTIME_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index f5588ea15a0b1..e5cf047256b70 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -23,13 +23,12 @@ #include "hsa_api.h" -#include "impl.h" #include "impl_runtime.h" #include "rt.h" #define MAX_NUM_KERNELS (1024 * 16) -typedef struct atmi_implicit_args_s { +typedef struct impl_implicit_args_s { unsigned long offset_x; unsigned long offset_y; unsigned long offset_z; @@ -40,7 +39,7 @@ typedef struct atmi_implicit_args_s { unsigned long cpu_worker_signals; unsigned long cpu_queue_ptr; unsigned long kernarg_template_ptr; -} atmi_implicit_args_t; +} impl_implicit_args_t; extern "C" { @@ -177,7 +176,7 @@ template inline T *alignUp(T *value, size_t alignment) { alignDown((intptr_t)(value + alignment - 1), alignment)); } -extern bool atl_is_atmi_initialized(); +extern bool atl_is_impl_initialized(); bool handle_group_signal(hsa_signal_value_t value, void *arg); diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp index 7043cc836a419..af3e392ad4a95 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp @@ -8,16 +8,16 @@ #include "interop_hsa.h" #include "internal.h" -hsa_status_t atmi_interop_hsa_get_symbol_info( +hsa_status_t interop_hsa_get_symbol_info( const std::map &SymbolInfoTable, int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) { /* // Typical usage: void *var_addr; size_t var_size; - atmi_interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, + interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr, &var_size); - atmi_memcpy(signal, host_add, var_addr, var_size); + impl_memcpy(signal, host_add, var_addr, var_size); */ if (!symbol || !var_addr || !var_size) @@ -38,14 +38,14 @@ hsa_status_t atmi_interop_hsa_get_symbol_info( } } -hsa_status_t atmi_interop_hsa_get_kernel_info( +hsa_status_t interop_hsa_get_kernel_info( const std::map &KernelInfoTable, int DeviceId, const char *kernel_name, hsa_executable_symbol_info_t kernel_info, uint32_t *value) { /* // Typical usage: uint32_t value; - atmi_interop_hsa_get_kernel_addr(gpu_place, "kernel_name", + interop_hsa_get_kernel_addr(gpu_place, "kernel_name", HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &val); */ @@ -68,7 +68,7 @@ hsa_status_t atmi_interop_hsa_get_kernel_info( break; case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: // return the size for non-implicit args - *value = info.kernel_segment_size - sizeof(atmi_implicit_args_t); + *value = info.kernel_segment_size - sizeof(impl_implicit_args_t); break; default: *value = 0; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h index f98e0532bbd25..86fa49ef63517 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h @@ -17,69 +17,15 @@ extern "C" { -/** \defgroup interop_hsa_functions HSA Interop - * @{ - */ - -/** - * @brief Get the device address and size of an HSA global symbol - * - * @detail Use this function to query the device address and size of an HSA - * global symbol. - * The symbol can be set at by the compiler or by the application writer in a - * language-specific manner. This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] symbol Pointer to a non-NULL global symbol name - * - * @param[in] var_addr Pointer to a non-NULL @p void* variable that will - * hold the device address of the global symbol object. - * - * @param[in] var_size Pointer to a non-NULL @p uint variable that will - * hold the size of the global symbol object. - * - * @retval ::HSA_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::HSA_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - */ -hsa_status_t atmi_interop_hsa_get_symbol_info( +hsa_status_t interop_hsa_get_symbol_info( const std::map &SymbolInfoTable, int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size); -/** - * @brief Get the HSA-specific kernel info from a kernel name - * - * @detail Use this function to query the HSA-specific kernel info from the - * kernel name. - * This function is meaningful only after calling one - * of the @p atmi_module_register functions. - * - * @param[in] place The ATMI memory place - * - * @param[in] kernel_name Pointer to a char array with the kernel name - * - * @param[in] info The different possible kernel properties - * - * @param[in] value Pointer to a non-NULL @p uint variable that will - * hold the return value of the kernel property. - * - * @retval ::HSA_STATUS_SUCCESS The function has executed successfully. - * - * @retval ::HSA_STATUS_ERROR If @p symbol, @p var_addr or @p var_size are - * invalid - * location in the current node, or if ATMI is not initialized. - */ -hsa_status_t atmi_interop_hsa_get_kernel_info( +hsa_status_t interop_hsa_get_kernel_info( const std::map &KernelInfoTable, int DeviceId, const char *kernel_name, hsa_executable_symbol_info_t info, uint32_t *value); -/** @} */ - } #endif // INCLUDE_INTEROP_HSA_H_ diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index 94c60d1c06156..06a8d34c28e5f 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -142,12 +142,7 @@ static const std::map ArgValueKind = { namespace core { hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) { -#if (ROCM_VERSION_MAJOR >= 3) || \ - (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3) if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) { -#else - if (event->event_type == GPU_MEMORY_FAULT_EVENT) { -#endif hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault; // memory_fault.agent // memory_fault.virtual_address @@ -550,7 +545,7 @@ static hsa_status_t get_code_object_custom_metadata( // implicit args by discounting the compiler set implicit args info.kernel_segment_size = (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) + - sizeof(atmi_implicit_args_t); + sizeof(impl_implicit_args_t); DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(), kernel_segment_size, info.kernel_segment_size); @@ -755,13 +750,13 @@ hsa_status_t RegisterModuleFromMemory( // Mutating the device image here avoids another allocation & memcpy void *code_object_alloc_data = reinterpret_cast(code_object.handle); - hsa_status_t atmi_err = + hsa_status_t impl_err = on_deserialized_data(code_object_alloc_data, module_size, cb_state); - if (atmi_err != HSA_STATUS_SUCCESS) { + if (impl_err != HSA_STATUS_SUCCESS) { printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Error in deserialized_data callback", - get_error_string(atmi_err)); - return atmi_err; + get_error_string(impl_err)); + return impl_err; } /* Load the code object. */ diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 4b2f2f2fc3019..15f67cea1ea98 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -139,7 +139,7 @@ struct KernelArgPool { std::queue free_kernarg_segments; uint32_t kernarg_size_including_implicit() { - return kernarg_segment_size + sizeof(atmi_implicit_args_t); + return kernarg_segment_size + sizeof(impl_implicit_args_t); } ~KernelArgPool() { @@ -160,7 +160,7 @@ struct KernelArgPool { hsa_amd_memory_pool_t &memory_pool) : kernarg_segment_size(kernarg_segment_size) { - // atmi uses one pool per kernel for all gpus, with a fixed upper size + // impl uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue hsa_status_t err = hsa_amd_memory_pool_allocate( @@ -494,14 +494,14 @@ class RTLDeviceInfoTy { std::vector DeviceFineGrainedMemoryPools; std::vector DeviceCoarseGrainedMemoryPools; - struct atmiFreePtrDeletor { + struct implFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free } }; // device_State shared across loaded binaries, error if inconsistent size - std::vector, uint64_t>> + std::vector, uint64_t>> deviceStateStore; static const unsigned HardTeamLimit = @@ -529,12 +529,12 @@ class RTLDeviceInfoTy { hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, size_t size, int32_t deviceId) { - return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId); + return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId); } hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, size_t size, int32_t deviceId) { - return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); + return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId); } // Record entry point associated with device @@ -801,7 +801,7 @@ class RTLDeviceInfoTy { return; } // Run destructors on types that use HSA before - // atmi_finalize removes access to it + // impl_finalize removes access to it deviceStateStore.clear(); KernelArgPoolMap.clear(); // Terminate hostrpc before finalizing ATMI @@ -1393,7 +1393,7 @@ struct device_environment { auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; void *state_ptr; uint32_t state_ptr_size; - hsa_status_t err = atmi_interop_hsa_get_symbol_info( + hsa_status_t err = interop_hsa_get_symbol_info( SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); if (err != HSA_STATUS_SUCCESS) { DP("failed to find %s in loaded image\n", sym()); @@ -1414,7 +1414,7 @@ struct device_environment { } }; -static hsa_status_t atmi_calloc(void **ret_ptr, size_t size, int DeviceId) { +static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) { uint64_t rounded = 4 * ((size + 3) / 4); void *ptr; hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); @@ -1524,7 +1524,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, void *state_ptr; uint32_t state_ptr_size; auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = atmi_interop_hsa_get_symbol_info( + hsa_status_t err = interop_hsa_get_symbol_info( SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr, &state_ptr_size); @@ -1552,13 +1552,13 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, if (dss.first.get() == nullptr) { assert(dss.second == 0); void *ptr = NULL; - hsa_status_t err = atmi_calloc(&ptr, device_State_bytes, device_id); + hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id); if (err != HSA_STATUS_SUCCESS) { DP("Failed to allocate device_state array\n"); return NULL; } dss = { - std::unique_ptr{ptr}, + std::unique_ptr{ptr}, device_State_bytes, }; } @@ -1608,7 +1608,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, uint32_t varsize; auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; - hsa_status_t err = atmi_interop_hsa_get_symbol_info( + hsa_status_t err = interop_hsa_get_symbol_info( SymbolInfoMap, device_id, e->name, &varptr, &varsize); if (err != HSA_STATUS_SUCCESS) { @@ -1650,7 +1650,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, uint32_t kernarg_segment_size; auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; - hsa_status_t err = atmi_interop_hsa_get_kernel_info( + hsa_status_t err = interop_hsa_get_kernel_info( KernelInfoMap, device_id, e->name, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); @@ -2149,7 +2149,7 @@ int32_t __tgt_rtl_run_target_team_region_locked( packet->group_segment_size = KernelInfoEntry.group_segment_size; packet->kernel_object = KernelInfoEntry.kernel_object; packet->kernarg_address = 0; // use the block allocator - packet->reserved2 = 0; // atmi writes id_ here + packet->reserved2 = 0; // impl writes id_ here packet->completion_signal = {0}; // may want a pool of signals KernelArgPool *ArgPool = nullptr; @@ -2181,11 +2181,11 @@ int32_t __tgt_rtl_run_target_team_region_locked( // Initialize implicit arguments. ATMI seems to leave most fields // uninitialized - atmi_implicit_args_t *impl_args = - reinterpret_cast( + impl_implicit_args_t *impl_args = + reinterpret_cast( static_cast(kernarg) + ArgPool->kernarg_segment_size); memset(impl_args, 0, - sizeof(atmi_implicit_args_t)); // may not be necessary + sizeof(impl_implicit_args_t)); // may not be necessary impl_args->offset_x = 0; impl_args->offset_y = 0; impl_args->offset_z = 0;