Skip to content

Commit

Permalink
[OpenMP] Be more forgiving during record and replay
Browse files Browse the repository at this point in the history
When we record and replay kernels we should not error out early if there
is a chance the program might still run fine. This patch will:
1) Fall back to the allocation heuristic if the VAMap doesn't work.
2) Adjust the memory start to match the required address if possible.
3) Adjust the (guessed) pointer arguments if the memory start adjustment
   is impossible. This will allow kernels without indirect accesses to
   work while indirect accesses will most likely fail.
  • Loading branch information
jdoerfert committed Nov 21, 2023
1 parent 41566fb commit f48c4d8
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 33 deletions.
3 changes: 2 additions & 1 deletion openmp/libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);

int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput);
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);

#ifdef __cplusplus
}
Expand Down
3 changes: 2 additions & 1 deletion openmp/libomptarget/include/rtl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "omptarget.h"

#include <cstdint>
#include <list>
#include <map>
#include <mutex>
Expand Down Expand Up @@ -74,7 +75,7 @@ struct RTLInfoTy {
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
typedef int32_t(set_device_offset_ty)(int32_t);
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
bool);
bool, uint64_t &);

int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct RecordReplayTy {

RRStatusTy Status;
bool ReplaySaveOutput;
bool UsedVAMap = false;
uintptr_t MemoryOffset = 0;

void *suggestAddress(uint64_t MaxMemoryAllocation) {
// Get a valid pointer address for this system
Expand Down Expand Up @@ -89,10 +91,12 @@ struct RecordReplayTy {
MemoryPtr = MemoryStart;
MemorySize = 0;
TotalSize = ASize;
UsedVAMap = true;
return Plugin::success();
}

Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
uint64_t RequiredMemoryAllocation, void *VAddr) {
const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
constexpr size_t STEP = 1024 * 1024 * 1024ULL;
MemoryStart = nullptr;
Expand All @@ -102,32 +106,55 @@ struct RecordReplayTy {
if (MemoryStart)
break;
}

INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
MemoryStart);

if (!MemoryStart)
return Plugin::error("Allocating record/replay memory");

if (VAddr && VAddr != MemoryStart)
return Plugin::error("Cannot allocate recorded address");
MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);

MemoryPtr = MemoryStart;
MemorySize = 0;

// Check if we need adjustment.
if (MemoryOffset > 0 &&
TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
// If we are off but "before" the required address and with enough space,
// we just "allocate" the offset to match the required address.
MemoryPtr = (char *)MemoryPtr + MemoryOffset;
MemorySize += MemoryOffset;
MemoryOffset = 0;
assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
} else if (MemoryOffset) {
// If we are off and in a situation we cannot just "waste" memory to force
// a match, we hope adjusting the arguments is sufficient.
REPORT(
"WARNING Failed to allocate replay memory at required location %p, "
"got %p, trying to offset argument pointers by %" PRIi64 "\n",
VAddr, MemoryStart, MemoryOffset);
}

INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
MemoryStart);

return Plugin::success();
}

Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
if (Device->supportVAManagement())
return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
if (Device->supportVAManagement()) {
auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
if (Err) {
REPORT("WARNING VA mapping failed, fallback to heuristic: "
"(Error: %s)\n",
toString(std::move(Err)).data());
}
}

uint64_t DevMemSize;
if (Device->getDeviceMemorySize(DevMemSize))
return Plugin::error("Cannot determine Device Memory Size");

return preAllocateHeuristic(DevMemSize, ReqVAddr);
return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
}

void dumpDeviceMemory(StringRef Filename) {
Expand Down Expand Up @@ -293,7 +320,7 @@ struct RecordReplayTy {
}

Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
RRStatusTy Status, bool SaveOutput) {
RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
this->Device = Device;
this->Status = Status;
this->ReplaySaveOutput = SaveOutput;
Expand All @@ -308,11 +335,14 @@ struct RecordReplayTy {
MemoryStart, TotalSize,
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");

// Tell the user to offset pointer arguments as the memory allocation does
// not match.
ReqPtrArgOffset = MemoryOffset;
return Plugin::success();
}

void deinit() {
if (Device->supportVAManagement()) {
if (UsedVAMap) {
if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
report_fatal_error("Error on releasing virtual memory space");
} else {
Expand Down Expand Up @@ -1694,15 +1724,16 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,

int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord,
bool SaveOutput) {
bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
GenericPluginTy &Plugin = Plugin::get();
GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
RecordReplayTy::RRStatusTy Status =
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
: RecordReplayTy::RRStatusTy::RRReplaying;

if (auto Err =
RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status,
SaveOutput, ReqPtrArgOffset)) {
REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
"(Error: %s)\n",
MemorySize, toString(std::move(Err)).data());
Expand Down
13 changes: 4 additions & 9 deletions openmp/libomptarget/src/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,15 +539,10 @@ void DeviceTy::init() {
// Enables saving the device memory kernel output post execution if set.
llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput(
"LIBOMPTARGET_RR_SAVE_OUTPUT", false);
// Sets the maximum to pre-allocate device memory.
llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize(
"LIBOMPTARGET_RR_DEVMEM_SIZE", 16);
DP("Activating Record-Replay for Device %d with %lu GB memory\n",
RTLDeviceID, OMPX_DeviceMemorySize.get());

RTL->activate_record_replay(RTLDeviceID,
OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
nullptr, true, OMPX_ReplaySaveOutput);

uint64_t ReqPtrArgOffset;
RTL->activate_record_replay(RTLDeviceID, 0, nullptr, true,
OMPX_ReplaySaveOutput, ReqPtrArgOffset);
}

IsInit = true;
Expand Down
8 changes: 5 additions & 3 deletions openmp/libomptarget/src/interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "Utilities.h"

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
Expand Down Expand Up @@ -347,15 +348,16 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
/// execution on persistent storage
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord,
bool SaveOutput) {
bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
if (!deviceIsReady(DeviceId)) {
DP("Device %" PRId64 " is not ready\n", DeviceId);
return OMP_TGT_FAIL;
}

DeviceTy &Device = *PM->Devices[DeviceId];
[[maybe_unused]] int Rc =
target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
[[maybe_unused]] int Rc = target_activate_rr(
Device, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
assert(Rc == OFFLOAD_SUCCESS &&
"__tgt_activate_record_replay unexpected failure!");
return OMP_TGT_SUCCESS;
Expand Down
6 changes: 4 additions & 2 deletions openmp/libomptarget/src/omptarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1725,9 +1725,11 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
/// and informing the record-replayer of whether to store the output
/// in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
bool isRecord, bool SaveOutput) {
bool isRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
isRecord, SaveOutput);
isRecord, SaveOutput,
ReqPtrArgOffset);
}

/// Executes a kernel using pre-recorded information for loading to
Expand Down
3 changes: 2 additions & 1 deletion openmp/libomptarget/src/private.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);

extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
void *ReqAddr, bool isRecord, bool SaveOutput);
void *ReqAddr, bool isRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);

extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
#include <cstdlib>

using namespace llvm;
Expand Down Expand Up @@ -128,8 +129,9 @@ int main(int argc, char **argv) {

__tgt_register_lib(&Desc);

uint64_t ReqPtrArgOffset = 0;
int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
false, VerifyOpt);
false, VerifyOpt, ReqPtrArgOffset);

if (Rc != OMP_TGT_SUCCESS) {
report_fatal_error("Cannot activate record replay\n");
Expand All @@ -149,6 +151,18 @@ int main(int argc, char **argv) {
const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
DeviceMemoryMB.get()->getBufferSize());

// If necessary, adjust pointer arguments.
// ReqPtrArgOffset is filled in through the by-reference parameter of the
// __tgt_activate_record_replay call; zero means no adjustment is requested.
if (ReqPtrArgOffset) {
// Bind each argument by reference so matches are rewritten in place.
for (auto *&Arg : TgtArgs) {
auto ArgInt = uintptr_t(Arg);
// Try to find pointer arguments.
// Only values inside [BAllocStart, BAllocStart + DeviceMemorySize) are
// treated as pointers; anything outside that range is left untouched.
if (ArgInt < uintptr_t(BAllocStart) ||
ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize)
continue;
// NOTE(review): no type information is consulted here, so a non-pointer
// argument whose value happens to fall in that range is shifted as well.
Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset);
}
}

__tgt_target_kernel_replay(
/* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data,
DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),
Expand Down

0 comments on commit f48c4d8

Please sign in to comment.