Skip to content

Commit

Permalink
[OpenMP] asynchronous memory copy support
Browse files Browse the repository at this point in the history
We introduced the implementation of supporting asynchronous routines with depend objects specified in Version 5.1 of the OpenMP Application Programming Interface. In brief, these routines omp_target_memcpy_async and omp_target_memcpy_rect_async perform asynchronous (nonblocking) memory copies between any
combination of host and device pointers. The basic idea is to create the implicit tasks to carry the memory copy calls and handle dependencies specified by depend objects. The implicit tasks are executed via hidden helper thread in OpenMP runtime.

Reviewed By: jdoerfert, tianshilei1992
Committed By: jplehr

Differential Revision: https://reviews.llvm.org/D136103
  • Loading branch information
Jisheng Zhao authored and jplehr committed Mar 30, 2023
1 parent f2696e4 commit 4753a4e
Show file tree
Hide file tree
Showing 8 changed files with 540 additions and 24 deletions.
24 changes: 0 additions & 24 deletions openmp/libomptarget/include/interop.h
Expand Up @@ -116,30 +116,6 @@ omp_get_interop_type_desc(const omp_interop_t, omp_interop_property_t);
extern const char *__KAI_KMPC_CONVENTION
omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);

typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
/* Compiler flags */ /* Total compiler flags must be 16 bits */
unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
unsigned final : 1; /* task is final(1) so execute immediately */
unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0
unsigned destructors_thunk : 1; // set if the compiler creates a thunk to
unsigned proxy : 1; // task is a proxy task (it will be executed outside the
unsigned priority_specified : 1; // set if the compiler provides priority
unsigned detachable : 1; // 1 == can detach */
unsigned unshackled : 1; /* 1 == unshackled task */
unsigned target : 1; /* 1 == target task */
unsigned reserved : 7; /* reserved for compiler use */
unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
unsigned tasking_ser : 1; // all tasks in team are either executed immediately
unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
unsigned started : 1; /* 1==started, 0==not started */
unsigned executing : 1; /* 1==executing, 0==not executing */
unsigned complete : 1; /* 1==complete, 0==not complete */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
unsigned reserved31 : 7; /* reserved for library use */
} kmp_tasking_flags_t;

typedef enum omp_interop_backend_type_t {
// reserve 0
omp_interop_backend_type_cuda_1 = 1,
Expand Down
138 changes: 138 additions & 0 deletions openmp/libomptarget/src/api.cpp
Expand Up @@ -15,6 +15,8 @@
#include "private.h"
#include "rtl.h"

#include "llvm/ADT/SmallVector.h"

#include <climits>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -207,6 +209,105 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
return Rc;
}

// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
if (Task == nullptr)
return OFFLOAD_FAIL;

TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;

if (Args == nullptr)
return OFFLOAD_FAIL;

// Call blocked version
int Rc = OFFLOAD_SUCCESS;
if (Args->IsRectMemcpy) {
Rc = omp_target_memcpy_rect(
Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);

DP("omp_target_memcpy_rect returns %d\n", Rc);
} else {
Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
Args->SrcOffset, Args->DstDevice, Args->SrcDevice);

DP("omp_target_memcpy returns %d\n", Rc);
}

// Release the arguments object
delete Args;

return Rc;
}

// Allocate and launch helper task
static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
int DepObjCount,
omp_depend_t *DepObjList) {
// Create global thread ID
int Gtid = __kmpc_global_thread_num(nullptr);
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;

// Setup the hidden helper flags;
kmp_int32 Flags = 0;
kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
InputFlags->hidden_helper = 1;

// Alloc helper task
kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
sizeof(kmp_task_t), 0, Fn, -1);

if (Ptr == nullptr) {
// Task allocation failed, delete the argument object
delete Args;

return OFFLOAD_FAIL;
}

// Setup the arguments passed to helper task
Ptr->shareds = Args;

// Convert the type of depend objects
llvm::SmallVector<kmp_depend_info_t> DepObjs;
for (int i = 0; i < DepObjCount; i++) {
omp_depend_t DepObj = DepObjList[i];
DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
}

// Launch the helper task
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
DepObjs.data(), 0, nullptr);

return Rc;
}

EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
TIMESCOPE();
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
Length);

// Check the source and dest address
if (Dst == nullptr || Src == nullptr)
return OFFLOAD_FAIL;

// Create task object
TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);

// Create and launch helper task
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);

DP("omp_target_memcpy_async returns %d\n", Rc);
return Rc;
}

EXTERN int
omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
int NumDims, const size_t *Volume,
Expand Down Expand Up @@ -267,6 +368,43 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
return Rc;
}

EXTERN int omp_target_memcpy_rect_async(
void *Dst, const void *Src, size_t ElementSize, int NumDims,
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
TIMESCOPE();
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
"volume " DPxMOD ", element size %zu, num_dims %d\n",
DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
DPxPTR(Volume), ElementSize, NumDims);

// Need to check this first to not return OFFLOAD_FAIL instead
if (!Dst && !Src) {
DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
INT_MAX);
return INT_MAX;
}

// Check the source and dest address
if (Dst == nullptr || Src == nullptr)
return OFFLOAD_FAIL;

// Create task object
TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
DstDimensions, SrcDimensions, DstDevice, SrcDevice);

// Create and launch helper task
int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);

DP("omp_target_memcpy_rect_async returns %d\n", Rc);
return Rc;
}

EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
size_t Size, size_t DeviceOffset,
int DeviceNum) {
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/src/exports
Expand Up @@ -41,6 +41,8 @@ VERS1.0 {
omp_target_is_present;
omp_target_memcpy;
omp_target_memcpy_rect;
omp_target_memcpy_async;
omp_target_memcpy_rect_async;
omp_target_associate_ptr;
omp_target_disassociate_ptr;
llvm_omp_target_alloc_host;
Expand Down
124 changes: 124 additions & 0 deletions openmp/libomptarget/src/private.h
Expand Up @@ -104,7 +104,51 @@ extern "C" {
* We maintain the same data structure for compatibility.
*/
typedef int kmp_int32;
typedef int64_t kmp_int64;
typedef intptr_t kmp_intptr_t;

typedef void *omp_depend_t;
struct kmp_task;
typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
typedef struct kmp_task {
void *shareds;
kmp_routine_entry_t routine;
kmp_int32 part_id;
} kmp_task_t;

typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
/* Compiler flags */ /* Total compiler flags must be 16 bits */
unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
unsigned final : 1; /* task is final(1) so execute immediately */
unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
code path */
unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
invoke destructors from the runtime */
unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
context of the RTL) */
unsigned priority_specified : 1; /* set if the compiler provides priority
setting for the task */
unsigned detachable : 1; /* 1 == can detach */
unsigned hidden_helper : 1; /* 1 == hidden helper task */
unsigned reserved : 8; /* reserved for compiler use */

/* Library flags */ /* Total library flags must be 16 bits */
unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
unsigned tasking_ser : 1; // all tasks in team are either executed immediately
// (1) or may be deferred (0)
unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
// (0) [>= 2 threads]
/* If either team_serial or tasking_ser is set, task team may be NULL */
/* Task State Flags: */
unsigned started : 1; /* 1==started, 0==not started */
unsigned executing : 1; /* 1==executing, 0==not executing */
unsigned complete : 1; /* 1==complete, 0==not complete */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
unsigned reserved31 : 7; /* reserved for library use */
} kmp_tasking_flags_t;

// Compiler sends us this info:
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
Expand All @@ -126,6 +170,86 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
__attribute__((weak));
bool __kmpc_omp_has_task_team(kmp_int32 gtid) __attribute__((weak));
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 flags, size_t sizeof_kmp_task_t,
size_t sizeof_shareds,
kmp_routine_entry_t task_entry)
__attribute__((weak));

kmp_task_t *
__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
size_t sizeof_kmp_task_t, size_t sizeof_shareds,
kmp_routine_entry_t task_entry,
kmp_int64 device_id) __attribute__((weak));

kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task, kmp_int32 ndeps,
kmp_depend_info_t *dep_list,
kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list)
__attribute__((weak));

/**
* The argument set that is passed from asynchronous memory copy to block
* version of memory copy invoked in helper task
*/
struct TargetMemcpyArgsTy {
/**
* Common attribuutes
*/
void *Dst;
const void *Src;
int DstDevice;
int SrcDevice;

/**
* The flag that denotes single dimensional or rectangle dimensional copy
*/
bool IsRectMemcpy;

/**
* Arguments for single dimensional copy
*/
size_t Length;
size_t DstOffset;
size_t SrcOffset;

/**
* Arguments for rectangle dimensional copy
*/
size_t ElementSize;
int NumDims;
const size_t *Volume;
const size_t *DstOffsets;
const size_t *SrcOffsets;
const size_t *DstDimensions;
const size_t *SrcDimensions;

/**
* Constructor for single dimensional copy
*/
TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice)
: Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0),
DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){};

/**
* Constructor for rectangle dimensional copy
*/
TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
int NumDims, const size_t *Volume,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice)
: Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
ElementSize(ElementSize), NumDims(NumDims), Volume(Volume),
DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
};
// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
Expand Down
48 changes: 48 additions & 0 deletions openmp/libomptarget/test/api/omp_target_memcpy_async1.c
@@ -0,0 +1,48 @@
// RUN: %libomptarget-compile-and-run-generic

// Test case for omp_target_memcpy_async, oringally from GCC

#include "stdio.h"
#include <omp.h>
#include <stdlib.h>

int main() {
int d = omp_get_default_device();
int id = omp_get_initial_device();
int q[128], i;
void *p;

if (d < 0 || d >= omp_get_num_devices())
d = id;

p = omp_target_alloc(130 * sizeof(int), d);
if (p == NULL)
return 0;

for (i = 0; i < 128; i++)
q[i] = i;

if (omp_target_memcpy_async(p, q, 128 * sizeof(int), sizeof(int), 0, d, id, 0,
NULL)) {
abort();
}

#pragma omp taskwait

int q2[128];
for (i = 0; i < 128; ++i)
q2[i] = 0;
if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
0, NULL))
abort();

#pragma omp taskwait

for (i = 0; i < 128; ++i)
if (q2[i] != q[i])
abort();

omp_target_free(p, d);

return 0;
}

0 comments on commit 4753a4e

Please sign in to comment.