diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index 818967c88904e..19e072abc402e 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -450,7 +450,8 @@ void __tgt_set_info_flag(uint32_t); int __tgt_print_device_info(int64_t DeviceId); int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, - void *VAddr, bool IsRecord, bool SaveOutput); + void *VAddr, bool IsRecord, bool SaveOutput, + uint64_t &ReqPtrArgOffset); #ifdef __cplusplus } diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h index 2272577684f0c..0c751cd36bfd2 100644 --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -20,6 +20,7 @@ #include "omptarget.h" +#include #include #include #include @@ -74,7 +75,7 @@ struct RTLInfoTy { typedef int32_t(data_notify_unmapped_ty)(int32_t, void *); typedef int32_t(set_device_offset_ty)(int32_t); typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool, - bool); + bool, uint64_t &); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index 08946e2103501..fb4db4adf0a36 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -55,6 +55,8 @@ struct RecordReplayTy { RRStatusTy Status; bool ReplaySaveOutput; + bool UsedVAMap = false; + uintptr_t MemoryOffset = 0; void *suggestAddress(uint64_t MaxMemoryAllocation) { // Get a valid pointer address for this system @@ -89,10 +91,12 @@ struct RecordReplayTy { MemoryPtr = MemoryStart; MemorySize = 0; TotalSize = ASize; + UsedVAMap = true; return Plugin::success(); } - Error 
preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) { + Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, + uint64_t RequiredMemoryAllocation, void *VAddr) { const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation; constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; @@ -102,32 +106,55 @@ struct RecordReplayTy { if (MemoryStart) break; } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), - "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize, - MemoryStart); - if (!MemoryStart) return Plugin::error("Allocating record/replay memory"); if (VAddr && VAddr != MemoryStart) - return Plugin::error("Cannot allocate recorded address"); + MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart); MemoryPtr = MemoryStart; MemorySize = 0; + // Check if we need adjustment (overflow-safe: MemoryOffset may wrap). + if (MemoryOffset > 0 && MemoryOffset <= TotalSize && + TotalSize - MemoryOffset >= RequiredMemoryAllocation) { + // If we are off but "before" the required address and with enough space, + // we just "allocate" the offset to match the required address. + MemoryPtr = (char *)MemoryPtr + MemoryOffset; + MemorySize += MemoryOffset; + MemoryOffset = 0; + assert(MemoryPtr == VAddr && "Expected offset adjustment to work"); + } else if (MemoryOffset) { + // If we are off and in a situation we cannot just "waste" memory to force + // a match, we hope adjusting the arguments is sufficient. 
+ REPORT( + "WARNING Failed to allocate replay memory at required location %p, " + "got %p, trying to offset argument pointers by %" PRIi64 "\n", + VAddr, MemoryStart, static_cast<int64_t>(MemoryOffset)); + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(), + "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize, + MemoryStart); + return Plugin::success(); } Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) { - if (Device->supportVAManagement()) - return preAllocateVAMemory(DeviceMemorySize, ReqVAddr); + if (Device->supportVAManagement()) { + auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr); + if (!Err) // VA mapping worked; do not allocate heuristically as well. + return Plugin::success(); + REPORT("WARNING VA mapping failed, fallback to heuristic: " + "(Error: %s)\n", + toString(std::move(Err)).data()); + } uint64_t DevMemSize; if (Device->getDeviceMemorySize(DevMemSize)) return Plugin::error("Cannot determine Device Memory Size"); - return preAllocateHeuristic(DevMemSize, ReqVAddr); + return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr); } void dumpDeviceMemory(StringRef Filename) { @@ -293,7 +320,7 @@ struct RecordReplayTy { } Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr, - RRStatusTy Status, bool SaveOutput) { + RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) { this->Device = Device; this->Status = Status; this->ReplaySaveOutput = SaveOutput; @@ -308,11 +335,14 @@ struct RecordReplayTy { MemoryStart, TotalSize, Status == RRStatusTy::RRRecording ? "Recording" : "Replaying"); + // Tell the user to offset pointer arguments as the memory allocation does + // not match. 
+ ReqPtrArgOffset = MemoryOffset; return Plugin::success(); } void deinit() { - if (Device->supportVAManagement()) { + if (UsedVAMap) { if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize)) report_fatal_error("Error on releasing virtual memory space"); } else { @@ -1694,15 +1724,16 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId, int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize, void *VAddr, bool isRecord, - bool SaveOutput) { + bool SaveOutput, + uint64_t &ReqPtrArgOffset) { GenericPluginTy &Plugin = Plugin::get(); GenericDeviceTy &Device = Plugin.getDevice(DeviceId); RecordReplayTy::RRStatusTy Status = isRecord ? RecordReplayTy::RRStatusTy::RRRecording : RecordReplayTy::RRStatusTy::RRReplaying; - if (auto Err = - RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) { + if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, + SaveOutput, ReqPtrArgOffset)) { REPORT("WARNING RR did not intialize RR-properly with %lu bytes" "(Error: %s)\n", MemorySize, toString(std::move(Err)).data()); diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 8a2fe4620b39c..da167845ccb06 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -539,15 +539,10 @@ void DeviceTy::init() { // Enables saving the device memory kernel output post execution if set. llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput( "LIBOMPTARGET_RR_SAVE_OUTPUT", false); - // Sets the maximum to pre-allocate device memory. 
- llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize( - "LIBOMPTARGET_RR_DEVMEM_SIZE", 16); - DP("Activating Record-Replay for Device %d with %lu GB memory\n", - RTLDeviceID, OMPX_DeviceMemorySize.get()); - - RTL->activate_record_replay(RTLDeviceID, - OMPX_DeviceMemorySize * 1024 * 1024 * 1024, - nullptr, true, OMPX_ReplaySaveOutput); + + uint64_t ReqPtrArgOffset = 0; + RTL->activate_record_replay(RTLDeviceID, 0, nullptr, true, + OMPX_ReplaySaveOutput, ReqPtrArgOffset); } IsInit = true; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index e9ab7f05c7a0a..1e6bfec012f3d 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -21,6 +21,7 @@ #include "Utilities.h" #include +#include #include #include #include @@ -347,15 +348,16 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, /// execution on persistent storage EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, - bool SaveOutput) { + bool SaveOutput, + uint64_t &ReqPtrArgOffset) { if (!deviceIsReady(DeviceId)) { DP("Device %" PRId64 " is not ready\n", DeviceId); return OMP_TGT_FAIL; } DeviceTy &Device = *PM->Devices[DeviceId]; - [[maybe_unused]] int Rc = - target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput); + [[maybe_unused]] int Rc = target_activate_rr( + Device, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset); assert(Rc == OFFLOAD_SUCCESS && "__tgt_activate_record_replay unexpected failure!"); return OMP_TGT_SUCCESS; diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 6c59bc1cf38a8..0da448fdbefa4 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1725,9 +1725,11 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, /// and informing the record-replayer of whether to store the output /// in some file. 
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, - bool isRecord, bool SaveOutput) { + bool isRecord, bool SaveOutput, + uint64_t &ReqPtrArgOffset) { return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr, - isRecord, SaveOutput); + isRecord, SaveOutput, + ReqPtrArgOffset); } /// Executes a kernel using pre-recorded information for loading to diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 2a06bdbd1b708..3eb500cbd4c97 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -42,7 +42,8 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, - void *ReqAddr, bool isRecord, bool SaveOutput); + void *ReqAddr, bool isRecord, bool SaveOutput, + uint64_t &ReqPtrArgOffset); extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, void *DeviceMemory, int64_t DeviceMemorySize, diff --git a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp index 254be7db6e01a..67304fdca61d4 100644 --- a/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp +++ b/openmp/libomptarget/tools/kernelreplay/llvm-omp-kernel-replay.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" +#include #include using namespace llvm; @@ -128,8 +129,9 @@ int main(int argc, char **argv) { __tgt_register_lib(&Desc); + uint64_t ReqPtrArgOffset = 0; int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart, - false, VerifyOpt); + false, VerifyOpt, ReqPtrArgOffset); if (Rc != OMP_TGT_SUCCESS) { report_fatal_error("Cannot activate record replay\n"); @@ -149,6 +151,18 @@ int main(int argc, char **argv) { const_cast(DeviceMemoryMB.get()->getBuffer().data()), 
DeviceMemoryMB.get()->getBufferSize()); + // If necessary, adjust pointer arguments. + if (ReqPtrArgOffset) { + for (auto *&Arg : TgtArgs) { + auto ArgInt = uintptr_t(Arg); + // Treat only values inside the requested address range as pointers. + if (ArgInt < uintptr_t(BAllocStart) || + ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize) + continue; + Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset); + } + } + __tgt_target_kernel_replay( /* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data, DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),