Skip to content

Commit

Permalink
[libomptarget][OpenMP] Initial implementation of omp_target_memset() …
Browse files Browse the repository at this point in the history
…and omp_target_memset_async() (#68706)

Implement a slow-path version of omp_target_memset*() 

There is a TODO to implement a fast path that uses an on-device
kernel instead of the host-based memory fill operation.  This may
require some additional plumbing to have kernels in libomptarget.so.
  • Loading branch information
mjklemm committed Oct 19, 2023
1 parent 970e745 commit f93a697
Show file tree
Hide file tree
Showing 10 changed files with 220 additions and 24 deletions.
1 change: 1 addition & 0 deletions openmp/libomptarget/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstDimensions,
const size_t *SrcDimensions, int DstDevice,
int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
Expand Down
132 changes: 108 additions & 24 deletions openmp/libomptarget/src/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
}

// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
static int libomp_target_memcpy_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
if (Task == nullptr)
return OFFLOAD_FAIL;

Expand Down Expand Up @@ -241,47 +241,129 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
return Rc;
}

// Allocate and launch helper task
static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
int DepObjCount,
omp_depend_t *DepObjList) {
// Task entry routine for the asynchronous memset: unpacks the
// TargetMemsetArgsTy record stored in Task->shareds, forwards it to
// omp_target_memset(), and then deletes the record.
//
// Returns OFFLOAD_FAIL when Task or its argument record is null, and
// OFFLOAD_SUCCESS otherwise. Gtid is not used by this routine.
static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) {
  // A null task is treated the same way as a task without arguments.
  TargetMemsetArgsTy *MemsetArgs =
      (Task == nullptr)
          ? nullptr
          : reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
  if (MemsetArgs == nullptr)
    return OFFLOAD_FAIL;

  // Perform the fill; the pointer returned by omp_target_memset() is not
  // needed here.
  omp_target_memset(MemsetArgs->Ptr, MemsetArgs->C, MemsetArgs->N,
                    MemsetArgs->DeviceNum);

  // This routine is responsible for releasing the argument record.
  delete MemsetArgs;
  return OFFLOAD_SUCCESS;
}

// Appends to Vec a copy of the kmp_depend_info_t that each of the first
// DepObjCount entries of DepObjList points to. Nothing is appended when
// DepObjCount is zero or negative.
static inline void
convertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
                    omp_depend_t *DepObjList) {
  int Idx = 0;
  while (Idx < DepObjCount) {
    // Each depend object is reinterpreted as a pointer to the runtime's
    // dependence descriptor and copied by value.
    auto *Info = reinterpret_cast<kmp_depend_info_t *>(DepObjList[Idx]);
    Vec.push_back(*Info);
    ++Idx;
  }
}

template <class T>
static inline int
libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *),
int DepObjCount, omp_depend_t *DepObjList) {
// Create global thread ID
int Gtid = __kmpc_global_thread_num(nullptr);
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;

// Setup the hidden helper flags;
// Setup the hidden helper flags
kmp_int32 Flags = 0;
kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
InputFlags->hidden_helper = 1;

// Alloc helper task
kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
sizeof(kmp_task_t), 0, Fn, -1);

if (Ptr == nullptr) {
// Task allocation failed, delete the argument object
// Alloc the helper task
kmp_task_t *Task = __kmpc_omp_target_task_alloc(
nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
if (!Task) {
delete Args;

return OFFLOAD_FAIL;
}

// Setup the arguments passed to helper task
Ptr->shareds = Args;
// Setup the arguments for the helper task
Task->shareds = Args;

// Convert the type of depend objects
// Convert types of depend objects
llvm::SmallVector<kmp_depend_info_t> DepObjs;
for (int i = 0; i < DepObjCount; i++) {
omp_depend_t DepObj = DepObjList[i];
DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
}
convertDepObjVector(DepObjs, DepObjCount, DepObjList);

// Launch the helper task
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
DepObjs.data(), 0, nullptr);

return Rc;
}

// Fills NumBytes bytes starting at Ptr with the byte value ByteVal, where Ptr
// refers to memory on device DeviceNum.
//
// Ptr       - start of the region to fill; a null pointer makes the call a
//             no-op.
// ByteVal   - fill value, passed on to memset().
// NumBytes  - number of bytes to fill; zero makes the call a no-op.
// DeviceNum - device that owns Ptr; the initial (host) device is filled
//             directly with memset().
//
// Always returns Ptr. The routine has no way to report failure, so errors on
// the device path are only reported through debug output.
EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
                               int DeviceNum) {
  TIMESCOPE();
  DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
  // of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0) {
    return Ptr;
  }

  if (DeviceNum == omp_get_initial_device()) {
    DP("filling memory on host via memset\n");
    memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail
  } else {
    // TODO: replace the omp_target_memset() slow path with the fast path.
    // That will require the ability to execute a kernel from within
    // libomptarget.so (which we do not have at the moment).

    // This is a very slow path: create a filled array on the host and upload
    // it to the GPU device.
    int InitialDevice = omp_get_initial_device();
    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
    if (Shadow) {
      (void)memset(Shadow, ByteVal, NumBytes);
      int Rc = omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
                                 InitialDevice);
      if (Rc != OFFLOAD_SUCCESS) {
        // The copy result cannot be propagated to the caller, so at least
        // leave a trace of the failure in the debug output.
        DP("omp_target_memset failed to fill memory due to error with "
           "omp_target_memcpy\n");
      }
      (void)omp_target_free(Shadow, InitialDevice);
    } else {
      // If the omp_target_alloc has failed, let's just not do anything.
      // omp_target_memset does not have any good way to fail, so we
      // simply avoid a catastrophic failure of the process for now.
      DP("omp_target_memset failed to fill memory due to error with "
         "omp_target_alloc\n");
    }
  }

  DP("omp_target_memset returns %p\n", Ptr);
  return Ptr;
}

// Asynchronous variant of omp_target_memset(): packages the fill request into
// a heap-allocated TargetMemsetArgsTy and hands it to
// libomp_helper_task_creation() together with the task routine
// libomp_target_memset_async_task and the caller's depend objects.
//
// Ptr, ByteVal, NumBytes, DeviceNum - forwarded to omp_target_memset() by the
//                                     task routine.
// DepObjCount, DepObjList           - depend objects passed through to the
//                                     helper task creation.
//
// Always returns Ptr. A null Ptr or a zero NumBytes makes the call a no-op
// and no task is created.
EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes,
                                     int DeviceNum, int DepObjCount,
                                     omp_depend_t *DepObjList) {
  DP("Call to omp_target_memset_async, device %d, device pointer %p, size "
     "%zu\n",
     DeviceNum, Ptr, NumBytes);

  // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
  // of unspecified behavior, see OpenMP spec).
  if (!Ptr || NumBytes == 0)
    return Ptr;

  // Create the task object to deal with the async invocation
  auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum};

  // omp_target_memset_async() cannot fail via a return code, so ignore the
  // return code of the helper function
  (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task,
                                    DepObjCount, DepObjList);

  return Ptr;
}

EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
Expand All @@ -302,7 +384,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);

// Create and launch helper task
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
DepObjCount, DepObjList);

DP("omp_target_memcpy_async returns %d\n", Rc);
return Rc;
Expand Down Expand Up @@ -399,7 +482,8 @@ EXTERN int omp_target_memcpy_rect_async(
DstDimensions, SrcDimensions, DstDevice, SrcDevice);

// Create and launch helper task
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task,
DepObjCount, DepObjList);

DP("omp_target_memcpy_rect_async returns %d\n", Rc);
return Rc;
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/src/exports
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ VERS1.0 {
omp_target_memcpy_rect;
omp_target_memcpy_async;
omp_target_memcpy_rect_async;
omp_target_memset;
omp_target_memset_async;
omp_target_associate_ptr;
omp_target_disassociate_ptr;
llvm_omp_target_alloc_host;
Expand Down
11 changes: 11 additions & 0 deletions openmp/libomptarget/src/private.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,17 @@ struct TargetMemcpyArgsTy {
DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
};

// Argument record for an asynchronous memset request. It is brace-initialised
// in field order, so the order of the members below must not change.
struct TargetMemsetArgsTy {
// Common attributes of a memset operation
// Start of the memory region to fill.
void *Ptr;
// Fill value, forwarded as the second argument of omp_target_memset().
int C;
// Number of bytes to fill.
size_t N;
// Device number forwarded to omp_target_memset().
int DeviceNum;

// no constructors defined, because this is a PoD
};

// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
Expand Down
45 changes: 45 additions & 0 deletions openmp/libomptarget/test/api/omp_target_memset.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// RUN: %libomptarget-compile-and-run-generic

#include "stdio.h"
#include <omp.h>
#include <stdlib.h>

// Checks omp_target_memset(): allocates 130 ints on the selected device,
// zero-fills them, copies 128 of them back (starting one int into the buffer)
// and verifies that every copied element is zero. Any mismatch aborts.
int main() {
  int d = omp_get_default_device();
  int id = omp_get_initial_device();
  int i;
  void *p;
  void *result;

  // Fall back to the initial device when the default device is not a valid
  // device number.
  if (d < 0 || d >= omp_get_num_devices())
    d = id;

  p = omp_target_alloc(130 * sizeof(int), d);
  if (p == NULL)
    return 0;

  // omp_target_memset() must hand back the pointer it was given.
  result = omp_target_memset(p, 0, 130 * sizeof(int), d);
  if (result != p) {
    abort();
  }

  // Pre-fill the destination with non-zero data so that a copy that never
  // happened cannot pass the check below (only element 0 starts out as 0).
  int q2[128];
  for (i = 0; i < 128; ++i)
    q2[i] = i;
  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
                              0, NULL))
    abort();

  // Wait for the asynchronous copy before inspecting q2.
#pragma omp taskwait

  for (i = 0; i < 128; ++i)
    if (q2[i] != 0)
      abort();

  omp_target_free(p, d);

  return 0;
}
2 changes: 2 additions & 0 deletions openmp/runtime/src/dllexports
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,8 @@ kmp_set_warnings_off 780
omp_target_memcpy_rect 887
omp_target_associate_ptr 888
omp_target_disassociate_ptr 889
omp_target_memset 3000
omp_target_memset_async 3001
%endif

kmp_set_disp_num_buffers 890
Expand Down
5 changes: 5 additions & 0 deletions openmp/runtime/src/include/omp.h.var
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,11 @@
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
int, omp_depend_t *);

/* OpenMP 6.0 device memory routines */
extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);

/*!
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
Expand Down
22 changes: 22 additions & 0 deletions openmp/runtime/src/include/omp_lib.f90.var
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,28 @@
integer (omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async

! Interface to the C-bound routine omp_target_memset.
! All four arguments are passed by value; the result is a C pointer.
!   ptr        - C pointer to the memory to fill
!   val        - fill value (C int)
!   count      - number of bytes to fill (C size_t)
!   device_num - device number (C int)
function omp_target_memset(ptr, val, count, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
type(c_ptr) :: omp_target_memset
type(c_ptr), value :: ptr
integer(c_int), value :: val
integer(c_size_t), value :: count
integer(c_int), value :: device_num
end function

! Interface to the C-bound routine omp_target_memset_async.
! The result is a C pointer.
!   ptr          - C pointer to the memory to fill (by value)
!   val          - fill value (C int, by value)
!   count        - number of bytes to fill (C size_t, by value)
!   device_num   - device number (C int, by value)
!   depobj_count - number of entries in depobj_list (C int, by value)
!   depobj_list  - optional assumed-size array of depend objects
function omp_target_memset_async(ptr, val, count, device_num, &
depobj_count, depobj_list) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
! omp_lib_kinds is brought in for omp_depend_kind.
use omp_lib_kinds
type(c_ptr) :: omp_target_memset_async
type(c_ptr), value :: ptr
integer(c_int), value :: val
integer(c_size_t), value :: count
integer(c_int), value :: device_num
integer(c_int), value :: depobj_count
integer(omp_depend_kind), optional :: depobj_list(*)
end function

function omp_target_associate_ptr(host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
use omp_lib_kinds
Expand Down
22 changes: 22 additions & 0 deletions openmp/runtime/src/include/omp_lib.h.var
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,28 @@
integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async

! Interface to the C-bound routine omp_target_memset.
! All four arguments are passed by value; the result is a C pointer.
!   ptr        - C pointer to the memory to fill
!   val        - fill value (C int)
!   count      - number of bytes to fill (C size_t)
!   device_num - device number (C int)
function omp_target_memset(ptr, val, count, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
type(c_ptr) :: omp_target_memset
type(c_ptr), value :: ptr
integer(c_int), value :: val
integer(c_size_t), value :: count
integer(c_int), value :: device_num
end function

! Interface to the C-bound routine omp_target_memset_async.
! The result is a C pointer.
!   ptr          - C pointer to the memory to fill (by value)
!   val          - fill value (C int, by value)
!   count        - number of bytes to fill (C size_t, by value)
!   device_num   - device number (C int, by value)
!   depobj_count - number of entries in depobj_list (C int, by value)
!   depobj_list  - optional assumed-size array of depend objects
! NOTE(review): the continuation line below has no leading '&', unlike the
! omp_target_associate_ptr interface in this file - confirm that fixed-form
! builds of this include file accept it.
function omp_target_memset_async(ptr, val, count, device_num, &
depobj_count, depobj_list) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
! NOTE(review): omp_lib_kinds is presumably needed for omp_depend_kind;
! verify that this module is visible wherever this file is included.
use omp_lib_kinds
type(c_ptr) :: omp_target_memset_async
type(c_ptr), value :: ptr
integer(c_int), value :: val
integer(c_size_t), value :: count
integer(c_int), value :: device_num
integer(c_int), value :: depobj_count
integer(omp_depend_kind), optional :: depobj_list(*)
end function

function omp_target_associate_ptr(host_ptr, device_ptr, size, &
& device_offset, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
Expand Down
2 changes: 2 additions & 0 deletions openmp/runtime/src/kmp_ftn_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@
#define FTN_TARGET_IS_PRESENT omp_target_is_present
#define FTN_TARGET_MEMCPY omp_target_memcpy
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
#define FTN_TARGET_MEMSET omp_target_memset
#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
#endif
Expand Down

0 comments on commit f93a697

Please sign in to comment.