diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h index aece53d7ee1ca..c6aacf4ce2124 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.h +++ b/openmp/libomptarget/include/Shared/PluginAPI.h @@ -219,9 +219,6 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize, void *VAddr, bool isRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); - -// Returns true if the device \p DeviceId suggests to use auto zero-copy. -int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId); } #endif // OMPTARGET_SHARED_PLUGIN_API_H diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc index b842c6eef1d4f..25ebe7d437f9d 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.inc +++ b/openmp/libomptarget/include/Shared/PluginAPI.inc @@ -47,4 +47,3 @@ PLUGIN_API_HANDLE(data_notify_mapped, false); PLUGIN_API_HANDLE(data_notify_unmapped, false); PLUGIN_API_HANDLE(set_device_offset, false); PLUGIN_API_HANDLE(initialize_record_replay, false); -PLUGIN_API_HANDLE(use_auto_zero_copy, false); diff --git a/openmp/libomptarget/include/Shared/Requirements.h b/openmp/libomptarget/include/Shared/Requirements.h index b16a1650f0c40..19d6b8ffca495 100644 --- a/openmp/libomptarget/include/Shared/Requirements.h +++ b/openmp/libomptarget/include/Shared/Requirements.h @@ -33,12 +33,7 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t { /// unified_shared_memory clause. OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, /// dynamic_allocators clause. - OMP_REQ_DYNAMIC_ALLOCATORS = 0x010, - /// Auto zero-copy extension: - /// when running on an APU, the GPU plugin may decide to - /// run in zero-copy even though the user did not program - /// their application with unified_shared_memory requirement. - OMPX_REQ_AUTO_ZERO_COPY = 0x020 + OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 }; class RequirementCollection { @@ -70,14 +65,6 @@ class RequirementCollection { return; } - // Auto zero-copy is only valid when no other requirement has been set - // and it is computed at device initialization time, after the requirement - // flag has already been set to OMP_REQ_NONE. - if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) { - SetFlags = NewFlags; - return; - } - // If multiple compilation units are present enforce // consistency across all of them for require clauses: // - reverse_offload diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h index 8b4396ac468d7..d28d3c508faf5 100644 --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -164,9 +164,6 @@ struct DeviceTy { /// Print all offload entries to stderr. void dumpOffloadEntries(); - /// Ask the device whether the runtime should use auto zero-copy. - bool useAutoZeroCopy(); - private: /// Deinitialize the device (and plugin). void deinit(); diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index b5f0baee23dc2..b67642e9e1bcb 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1848,9 +1848,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), OMPX_UseMultipleSdmaEngines( "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false), - HSAXnackEnv("HSA_XNACK", false), AMDGPUStreamManager(*this, Agent), - AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), - HostDevice(HostDevice) {} + AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this), + AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {} ~AMDGPUDeviceTy() {} @@ -1941,10 +1940,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals)) return Err; - // detect if device is an APU. - if (auto Err = checkIfAPU()) - return Err; - return Plugin::success(); } @@ -2636,14 +2631,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } - /// Returns true if auto zero-copy the best configuration for the current - /// arch. - bool useAutoZeroCopyImpl() override { - // XNACK can be enabled with with kernel boot parameter or with - // environment variable. - return (IsAPU && (HSAXnackEnv || utils::isXnackEnabledViaKernelParam())); - } - /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { Value = StackSize; @@ -2741,30 +2728,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; } - /// Detect if current architecture is an APU. - Error checkIfAPU() { - std::string StrGfxName(ComputeUnitKind); - std::transform(std::begin(StrGfxName), std::end(StrGfxName), - std::begin(StrGfxName), - [](char c) { return std::tolower(c); }); - if (StrGfxName == "gfx940") { - IsAPU = true; - return Plugin::success(); - } - if (StrGfxName == "gfx942") { - // can be MI300A or MI300X - uint32_t ChipID = 0; - if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID)) - return Err; - - if (!(ChipID & 0x1)) { - IsAPU = true; - return Plugin::success(); - } - } - return Plugin::success(); - } - /// Envar for controlling the number of HSA queues per device. High number of /// queues may degrade performance. UInt32Envar OMPX_NumQueues; @@ -2801,9 +2764,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Use ROCm 5.7 interface for multiple SDMA engines BoolEnvar OMPX_UseMultipleSdmaEngines; - /// Value of HSA_XNACK environment variable. - BoolEnvar HSAXnackEnv; - /// Stream manager for AMDGPU streams. AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2834,9 +2794,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// The current size of the stack that will be used in cases where it could /// not be statically determined. uint64_t StackSize = 16 * 1024 /* 16 KB */; - - /// Is the plugin associated with an APU? - bool IsAPU{false}; }; Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index c5a58f8244145..58a3b5df00fac 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -116,34 +116,6 @@ inline bool isImageCompatibleWithEnv(StringRef ImageArch, uint32_t ImageFlags, return true; } -inline bool isXnackEnabledViaKernelParam() { - - ErrorOr> FileOrError = - MemoryBuffer::getFileAsStream("/proc/cmdline"); - - if (std::error_code ErrorCode = FileOrError.getError()) { - FAILURE_MESSAGE("Cannot open /proc/cmdline : %s\n", - ErrorCode.message().c_str()); - return false; - } - - StringRef FileContent = (FileOrError.get())->getBuffer(); - - StringRef RefString("amdgpu.noretry="); - int SizeOfRefString = RefString.size(); - - size_t Pos = FileContent.find_insensitive(RefString); - // Is noretry defined? - if (Pos != StringRef::npos) { - bool NoRetryValue = FileContent[Pos + SizeOfRefString] - '0'; - // is noretry set to 0 - if (!NoRetryValue) - return true; - } - - return false; -} - struct KernelMetaDataTy { uint64_t KernelObject; uint32_t GroupSegmentList; diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index abe85f43c2e72..b85dc146d86d2 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -872,11 +872,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error getDeviceStackSize(uint64_t &V) = 0; - /// Returns true if current plugin architecture is an APU - /// and unified_shared_memory was not requested by the program. - bool useAutoZeroCopy(); - virtual bool useAutoZeroCopyImpl() { return false; } - private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index e82c2f7bef14f..9490e58fc669c 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -1561,8 +1561,6 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) { return syncEventImpl(EventPtr); } -bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); } - Error GenericPluginTy::init() { auto NumDevicesOrErr = initImpl(); if (!NumDevicesOrErr) @@ -2075,14 +2073,6 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) { - // Automatic zero-copy only applies to programs that did - // not request unified_shared_memory and are deployed on an - // APU with XNACK enabled. - if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) - return false; - return Plugin::get().getDevice(DeviceId).useAutoZeroCopy(); -} #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp index 87ab70dec2a2d..a5c24810e0af9 100644 --- a/openmp/libomptarget/src/OpenMP/Mapping.cpp +++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp @@ -252,9 +252,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( MESSAGE("device mapping required by 'present' map type modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", DPxPTR(HstPtrBegin), Size); - } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY && - !HasCloseModifier) || - (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) { + } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) { // If unified shared memory is active, implicitly mapped variables that are // not privatized use host address. Any explicitly mapped variables also use // host address where correctness is not impeded. In all other cases maps @@ -262,10 +261,6 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( // In addition to the mapping rules above, the close map modifier forces the // mapping of the variable to the device. if (Size) { - INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID, - "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared " - "memory\n", - DPxPTR((uintptr_t)HstPtrBegin), Size); DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared " "memory\n", DPxPTR((uintptr_t)HstPtrBegin), Size); @@ -420,8 +415,7 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin( LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction, LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction); LR.TPR.TargetPointer = (void *)TP; - } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY || - PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) { + } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) { // If the value isn't found in the mapping and unified shared memory // is on then it means we have stumbled upon a value which we need to // use directly from the host. diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 82b0ecdcd647a..da2e08180eead 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -144,33 +144,19 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) { int32_t NumPD = getNumberOfPluginDevices(); ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD); - // Auto zero-copy is a per-device property. We need to ensure - // that all devices are suggesting to use it. - bool UseAutoZeroCopy = true; - if (NumPD == 0) - UseAutoZeroCopy = false; for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) { auto Device = std::make_unique(this, UserDevId, PDevI); - if (auto Err = Device->init()) { DP("Skip plugin known device %d: %s\n", PDevI, toString(std::move(Err)).c_str()); continue; } - UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy(); ExclusiveDevicesAccessor->push_back(std::move(Device)); ++NumberOfUserDevices; ++UserDevId; } - // Auto Zero-Copy can only be currently triggered when the system is an - // homogeneous APU architecture without attached discrete GPUs. - // If all devices suggest to use it, change requirment flags to trigger - // zero-copy behavior when mapping memory. - if (UseAutoZeroCopy) - PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY); - DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n", DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices, NumberOfPluginDevices); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 919c4b55c0365..dbad13b92bcc1 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -340,9 +340,3 @@ void DeviceTy::dumpOffloadEntries() { fprintf(stderr, " %11s: %s\n", Kind, It.second->getNameAsCStr()); } } - -bool DeviceTy::useAutoZeroCopy() { - if (RTL->use_auto_zero_copy) - return RTL->use_auto_zero_copy(RTLDeviceID); - return false; -} diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp deleted file mode 100644 index 80bb3d24a0c6e..0000000000000 --- a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp +++ /dev/null @@ -1,59 +0,0 @@ - -// RUN: %libomptarget-compilexx-generic -// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \ -// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK - -// RUN: %libomptarget-compilexx-generic -// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 USE_USM=1 %libomptarget-run-generic 2>&1 \ -// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK - -// RUN: %libomptarget-compilexx-generic -// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \ -// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -#include - -#if (USE_USM == 1) -#pragma omp requires unified_shared_memory -#endif - -int main() { - int n = 1024; - - // test various mapping types - int *a = new int[n]; - int k = 3; - int b[n]; - - for (int i = 0; i < n; i++) - b[i] = i; - - // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory - // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory - - // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, - // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, - // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed) - // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed) -#pragma omp target teams distribute parallel for map(tofrom : a[ : n]) \ - map(to : b[ : n]) - for (int i = 0; i < n; i++) - a[i] = i + b[i] + k; - - int err = 0; - for (int i = 0; i < n; i++) - if (a[i] != i + b[i] + k) - err++; - - // CHECK: PASS - if (err == 0) - printf("PASS\n"); - return err; -}