Skip to content

Commit

Permalink
[OpenMP] Support for OpenMP-Offload Record Replay
Browse files Browse the repository at this point in the history
Enable record-replay for OpenMP offload kernels. When recording, record-replay
is set up during device initialization by reading environment variables (this is
similar to the way rr used to operate). The primary change takes place in the
replay phase, where the replay tool explicitly initializes the record-replay functionality.

Differential Revision: https://reviews.llvm.org/D156174

Fix
  • Loading branch information
koparasy authored and koparasy committed Aug 5, 2023
1 parent b53f6ef commit 73cb01d
Show file tree
Hide file tree
Showing 12 changed files with 242 additions and 66 deletions.
4 changes: 4 additions & 0 deletions openmp/libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,10 @@ int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
void __tgt_set_info_flag(uint32_t);

int __tgt_print_device_info(int64_t DeviceId);

// Activate the record-replay mechanism for device DeviceId.
// NOTE(review): judging by the parameter names, MemorySize is the amount of
// device memory to reserve, IsRecord selects recording (true) vs. replaying
// (false), and SaveOutput requests dumping device memory after the kernel
// runs -- confirm against the implementation, which is not part of this header.
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
bool IsRecord, bool SaveOutput);

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/include/rtl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ struct RTLInfoTy {
typedef int32_t(data_unlock_ty)(int32_t, void *);
typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);

int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,
Expand Down Expand Up @@ -124,6 +125,7 @@ struct RTLInfoTy {
data_unlock_ty *data_unlock = nullptr;
data_notify_mapped_ty *data_notify_mapped = nullptr;
data_notify_unmapped_ty *data_notify_unmapped = nullptr;
activate_record_replay_ty *activate_record_replay = nullptr;

// Are there images associated with this RTL.
bool IsUsed = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,5 +358,7 @@ bool JITEngine::checkBitcodeImage(const __tgt_device_image &Image) {
auto BitcodeTA = Triple(ActualTriple).getArch();
BitcodeImageMap[Image.ImageStart] = BitcodeTA;

DP("Is%s IR Image\n", BitcodeTA == TT.getArch() ? " " : " NOT");

return BitcodeTA == TT.getArch();
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ GenericPluginTy *Plugin::SpecificPlugin = nullptr;

// TODO: Fix any thread safety issues for multi-threaded kernel recording.
struct RecordReplayTy {

// The three states the record replay mechanism can be in.
enum RRStatusTy {
  // Neither recording nor replaying; this is the zero value.
  RRDeactivated = 0,
  // Kernel launches are being recorded.
  RRRecording,
  // A previously recorded kernel is being replayed.
  RRReplaying
};

private:
// Memory pointers for recording, replaying memory.
void *MemoryStart;
Expand All @@ -47,26 +51,19 @@ struct RecordReplayTy {
GenericDeviceTy *Device;
std::mutex AllocationLock;

// Environment variables for record and replay.
// Enables recording kernels if set.
BoolEnvar OMPX_RecordKernel;
// Enables replaying a kernel if set.
BoolEnvar OMPX_ReplayKernel;
// Enables saving the device memory kernel output post execution if set.
BoolEnvar OMPX_ReplaySaveOutput;
// Sets the maximum to pre-allocate device memory.
UInt32Envar OMPX_DeviceMemorySize;
RRStatusTy Status;
bool ReplaySaveOutput;
uint64_t DeviceMemorySize;

// Record/replay pre-allocates the largest possible device memory using the
// default kind.
// TODO: Expand allocation to include other kinds (device, host, shared) and
// possibly use a MemoryManager to track (de-)allocations for
// storing/retrieving when recording/replaying.
Error preallocateDeviceMemory() {
Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
// Pre-allocate memory on device. Starts with 64GB and subtracts in steps
// of 1GB until allocation succeeds.
const size_t MAX_MEMORY_ALLOCATION =
OMPX_DeviceMemorySize * 1024 * 1024 * 1024ULL;
const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
constexpr size_t STEP = 1024 * 1024 * 1024ULL;
MemoryStart = nullptr;
for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
Expand All @@ -85,15 +82,14 @@ struct RecordReplayTy {
return Plugin::success();
}

void dumpDeviceMemory(StringRef Filename,
AsyncInfoWrapperTy &AsyncInfoWrapper) {
void dumpDeviceMemory(StringRef Filename) {
ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
if (!DeviceMemoryMB)
report_fatal_error("Error creating MemoryBuffer for device memory");

auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
MemoryStart, MemorySize, AsyncInfoWrapper);
MemoryStart, MemorySize, nullptr);
if (Err)
report_fatal_error("Error retrieving data for target pointer");

Expand All @@ -108,21 +104,19 @@ struct RecordReplayTy {
}

public:
bool isRecording() const { return OMPX_RecordKernel; }
bool isReplaying() const { return OMPX_ReplayKernel; }
bool isRecording() const { return Status == RRStatusTy::RRRecording; }
bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
bool isRecordingOrReplaying() const {
return (OMPX_RecordKernel || OMPX_ReplayKernel);
return (Status != RRStatusTy::RRDeactivated);
}
bool isSaveOutputEnabled() const { return OMPX_ReplaySaveOutput; }
void setStatus(RRStatusTy Status) { this->Status = Status; }
bool isSaveOutputEnabled() const { return ReplaySaveOutput; }

RecordReplayTy()
: OMPX_RecordKernel("LIBOMPTARGET_RECORD"),
OMPX_ReplayKernel("LIBOMPTARGET_REPLAY"),
OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT"),
OMPX_DeviceMemorySize("LIBOMPTARGET_RR_DEVMEM_SIZE",
/* Default in GB */ 64) {}
: Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
DeviceMemorySize(-1) {}

void saveImage(const char *Name, DeviceImageTy &Image) {
void saveImage(const char *Name, const DeviceImageTy &Image) {
SmallString<128> ImageName = {Name, ".image"};
std::error_code EC;
raw_fd_ostream OS(ImageName, EC);
Expand All @@ -140,11 +134,60 @@ struct RecordReplayTy {
OS.close();
}

void saveKernelInputInfo(const char *Name, void **ArgPtrs,
ptrdiff_t *ArgOffsets, int32_t NumArgs,
uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
uint64_t LoopTripCount,
AsyncInfoWrapperTy &AsyncInfoWrapper) {
/// Dump every non-empty entry of \p Image's offload entry table to
/// \p Filename.
///
/// The file is a sequence of records laid out back to back, one per entry:
///   <NUL-terminated entry name> <uint32_t entry size> <entry bytes>
/// where the entry bytes are read back from the device. Entries whose size is
/// zero are skipped. Every failure (buffer creation, device read, opening the
/// output file) is reported through report_fatal_error.
void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
  // Use size_t for the running total: a signed 32-bit accumulator overflows
  // as soon as the globals add up to 2GB or more.
  size_t Size = 0;

  for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
    if (!OffloadEntry.size)
      continue;
    Size += std::strlen(OffloadEntry.name) + /* '\0' */ 1 +
            /* OffloadEntry.size value */ sizeof(uint32_t) +
            OffloadEntry.size;
  }

  ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
      WritableMemoryBuffer::getNewUninitMemBuffer(Size);
  // Check the wrapped pointer as well: an ErrorOr holding a null buffer still
  // reports success.
  if (!GlobalsMB || !*GlobalsMB)
    report_fatal_error("Error creating MemoryBuffer for globals memory");

  void *BufferPtr = GlobalsMB.get()->getBufferStart();
  for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
    if (!OffloadEntry.size)
      continue;

    // Record field 1: the entry name, including its terminating '\0'.
    const size_t NameLength = std::strlen(OffloadEntry.name) + 1;
    memcpy(BufferPtr, OffloadEntry.name, NameLength);
    BufferPtr = advanceVoidPtr(BufferPtr, NameLength);

    // Record field 2: the entry size as a uint32_t. The write position
    // follows a variable-length name and is generally not 4-byte aligned, so
    // copy the bytes instead of storing through a casted pointer.
    const uint32_t EntrySize = OffloadEntry.size;
    memcpy(BufferPtr, &EntrySize, sizeof(EntrySize));
    BufferPtr = advanceVoidPtr(BufferPtr, sizeof(EntrySize));

    // Record field 3: the entry contents, read back from the device.
    if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr,
                                        OffloadEntry.size, nullptr))
      report_fatal_error("Error retrieving data for global");
    BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size);
  }
  assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
         "Buffer over/under-filled.");
  assert(Size == static_cast<size_t>(getPtrDiff(
                     BufferPtr, GlobalsMB->get()->getBufferStart())) &&
         "Buffer size mismatch");

  StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
  std::error_code EC;
  raw_fd_ostream OS(Filename, EC);
  // Do not write into a stream that failed to open.
  if (EC)
    report_fatal_error("Error opening file for dumping globals");
  OS << GlobalsMemory;
  OS.close();
}

void saveKernelInputInfo(const char *Name, DeviceImageTy &Image,
void **ArgPtrs, ptrdiff_t *ArgOffsets,
int32_t NumArgs, uint64_t NumTeamsClause,
uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
json::Object JsonKernelInfo;
JsonKernelInfo["Name"] = Name;
JsonKernelInfo["NumArgs"] = NumArgs;
Expand All @@ -165,7 +208,10 @@ struct RecordReplayTy {
JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));

SmallString<128> MemoryFilename = {Name, ".memory"};
dumpDeviceMemory(MemoryFilename, AsyncInfoWrapper);
dumpDeviceMemory(MemoryFilename);

SmallString<128> GlobalsFilename = {Name, ".globals"};
dumpGlobals(GlobalsFilename, Image);

SmallString<128> JsonFilename = {Name, ".json"};
std::error_code EC;
Expand All @@ -177,11 +223,10 @@ struct RecordReplayTy {
JsonOS.close();
}

void saveKernelOutputInfo(const char *Name,
AsyncInfoWrapperTy &AsyncInfoWrapper) {
void saveKernelOutputInfo(const char *Name) {
SmallString<128> OutputFilename = {
Name, (isRecording() ? ".original.output" : ".replay.output")};
dumpDeviceMemory(OutputFilename, AsyncInfoWrapper);
dumpDeviceMemory(OutputFilename);
}

void *alloc(uint64_t Size) {
Expand All @@ -194,12 +239,28 @@ struct RecordReplayTy {
Alloc = MemoryPtr;
MemoryPtr = (char *)MemoryPtr + AlignedSize;
MemorySize += AlignedSize;
DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
return Alloc;
}

Error init(GenericDeviceTy *Device) {
Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
bool SaveOutput) {
this->Device = Device;
return preallocateDeviceMemory();
this->Status = Status;
this->DeviceMemorySize = MemSize;
this->ReplaySaveOutput = SaveOutput;

if (auto Err = preallocateDeviceMemory(MemSize))
return Err;

INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Record Replay Initialized (%p)"
" as starting address, %lu Memory Size"
" and set on status %s\n",
MemoryStart, MemSize,
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");

return Plugin::success();
}

void deinit() { Device->free(MemoryStart); }
Expand Down Expand Up @@ -227,7 +288,11 @@ void AsyncInfoWrapperTy::finalize(Error &Err) {

Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
DeviceImageTy &Image) {

ImagePtr = &Image;

PreferredNumThreads = GenericDevice.getDefaultNumThreads();

MaxNumThreads = GenericDevice.getThreadLimit();

return initImpl(GenericDevice, Image);
Expand Down Expand Up @@ -468,10 +533,6 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
if (EnableMM)
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);

if (RecordReplay.isRecordingOrReplaying())
if (auto Err = RecordReplay.init(this))
return Err;

return Plugin::success();
}

Expand Down Expand Up @@ -1087,26 +1148,31 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
__tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
AsyncInfoWrapperTy AsyncInfoWrapper(
*this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo);

GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);

if (RecordReplay.isRecording())
RecordReplay.saveKernelInputInfo(
GenericKernel.getName(), ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0], KernelArgs.Tripcount,
AsyncInfoWrapper);
GenericKernel.getName(), GenericKernel.getImage(), ArgPtrs, ArgOffsets,
KernelArgs.NumArgs, KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0],
KernelArgs.Tripcount);

if (RecordReplay.isRecording())
RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage());

auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
AsyncInfoWrapper);

// 'finalize' here to guarantee next record-replay actions are in-sync
AsyncInfoWrapper.finalize(Err);

if (RecordReplay.isRecordingOrReplaying() &&
RecordReplay.isSaveOutputEnabled())
RecordReplay.saveKernelOutputInfo(GenericKernel.getName(),
AsyncInfoWrapper);
RecordReplay.saveKernelOutputInfo(GenericKernel.getName());

AsyncInfoWrapper.finalize(Err);
return Err;
}

Expand Down Expand Up @@ -1358,6 +1424,28 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
}

/// Initialize the record-replay mechanism for device \p DeviceId.
///
/// \p isRecord selects the recording state (true) or the replaying state
/// (false); \p MemorySize and \p SaveOutput are forwarded unchanged to
/// RecordReplay.init(). When initialization fails a warning is reported and
/// record-replay is switched back to the deactivated state.
///
/// \returns OFFLOAD_FAIL only when initialization fails while replaying;
/// a failure while recording is reported but still yields OFFLOAD_SUCCESS.
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
                                           uint64_t MemorySize, bool isRecord,
                                           bool SaveOutput) {
  GenericPluginTy &Plugin = Plugin::get();
  GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
  RecordReplayTy::RRStatusTy Status =
      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
               : RecordReplayTy::RRStatusTy::RRReplaying;

  if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
    // uint64_t is not 'unsigned long' on every platform, so print it through
    // an explicit 'unsigned long long' cast with a matching format specifier.
    REPORT("WARNING RR did not initialize RR-properly with %llu bytes "
           "(Error: %s)\n",
           static_cast<unsigned long long>(MemorySize),
           toString(std::move(Err)).data());
    RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);

    // A replay cannot proceed without record-replay, so only that case is
    // surfaced to the caller as a failure.
    if (!isRecord)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
__tgt_device_image *TgtImage) {
GenericPluginTy &Plugin = Plugin::get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ class DeviceImageTy {
private:
__tgt_target_table TTTablePtr;
llvm::SmallVector<__tgt_offload_entry> Entries;

public:
// Read-only iteration over the offload entries held in `Entries`, so the
// object can be used directly in a range-based for loop.
using const_iterator = decltype(Entries)::const_iterator;
// Iterator to the first stored entry.
const_iterator begin() const { return Entries.begin(); }
// Past-the-end iterator of the stored entries.
const_iterator end() const { return Entries.end(); }
};

/// Image identifier within the corresponding device. Notice that this id is
Expand Down Expand Up @@ -274,6 +279,12 @@ struct GenericKernelTy {
/// Get the kernel name.
const char *getName() const { return Name; }

/// Return the device image associated with this kernel. Asserts that the
/// image pointer has been set, i.e. that the kernel has been initialized.
DeviceImageTy &getImage() const {
  assert(ImagePtr && "Kernel is not initialized!");
  DeviceImageTy &KernelImage = *ImagePtr;
  return KernelImage;
}

/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
switch (ExecutionMode) {
Expand Down Expand Up @@ -343,6 +354,9 @@ struct GenericKernelTy {
/// The execution flags of the kernel.
OMPTgtExecModeFlags ExecutionMode;

/// The image that contains this kernel.
DeviceImageTy *ImagePtr = nullptr;

protected:
/// The preferred number of threads to run the kernel.
uint32_t PreferredNumThreads;
Expand Down

0 comments on commit 73cb01d

Please sign in to comment.