Skip to content

Commit

Permalink
[OpenMP][libomptarget] Add data sharing support in libomptarget
Browse files Browse the repository at this point in the history
Summary: This patch extends the libomptarget functionality added in patch D14254 with support for the data sharing scheme used to handle implicitly shared variables. The runtime therefore maintains a list of references to shared variables.

Reviewers: carlo.bertolli, ABataev, Hahnfeld, grokos, caomhin, hfinkel

Reviewed By: Hahnfeld, grokos

Subscribers: guansong, llvm-commits, openmp-commits

Differential Revision: https://reviews.llvm.org/D41485

llvm-svn: 324495
  • Loading branch information
doru1004 committed Feb 7, 2018
1 parent f4e3f3e commit aaeab8d
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 1 deletion.
3 changes: 2 additions & 1 deletion openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
Expand Up @@ -470,8 +470,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
EXTERN void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
void ***SharedArgs, int32_t nArgs,
int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
EXTERN bool __kmpc_kernel_parallel(void **WorkFn, void ***SharedArgs,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
Expand Down
5 changes: 5 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
Expand Up @@ -46,3 +46,8 @@ __device__ __shared__ DataSharingStateTy DataSharingState;
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;

////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
6 changes: 6 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
Expand Up @@ -54,6 +54,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);

// init parallel work arguments
omptarget_nvptx_sharedArgs.Init();

if (!RequiresOMPRuntime) {
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Generic, RuntimeUninitialized);
Expand Down Expand Up @@ -107,6 +110,9 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
}
// Done with work. Kill the workers.
omptarget_nvptx_workFn = 0;

// Deinit parallel work arguments
omptarget_nvptx_sharedArgs.DeInit();
}

EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
Expand Down
40 changes: 40 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
Expand Up @@ -62,6 +62,46 @@
#define __ACTIVEMASK() __ballot(1)
#endif

// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
  // NOTE: Init/DeInit/EnsureSize must be invoked by the team master thread
  // only; GetArgs may be called by any thread.

  // Reset the argument list to the statically pre-allocated buffer.
  INLINE void Init() {
    nArgs = MAX_SHARED_ARGS;
    args = buffer;
  }

  // Release any dynamically allocated argument storage (present only when a
  // parallel region required more than MAX_SHARED_ARGS arguments) and fall
  // back to the static buffer.
  INLINE void DeInit() {
    if (nArgs <= MAX_SHARED_ARGS)
      return;
    SafeFree(args, (char *)"new extended args");
    Init();
  }

  // Grow the argument list so it can hold at least 'size' entries; a no-op
  // when the current capacity already suffices.
  INLINE void EnsureSize(int size) {
    if (size <= nArgs)
      return;
    // Only a previously extended (heap-allocated) list must be freed; the
    // static buffer is never released.
    if (nArgs > MAX_SHARED_ARGS)
      SafeFree(args, (char *)"new extended args");
    args = (void **)SafeMalloc(size * sizeof(void *),
                               (char *)"new extended args");
    nArgs = size;
  }

  // Current argument list — callable by all threads.
  INLINE void **GetArgs() { return args; };

private:
  // Statically pre-allocated argument slots.
  void *buffer[MAX_SHARED_ARGS];
  // Active argument list: aliases 'buffer' until a dynamic extension occurs.
  void **args;
  // Capacity of 'args'; starts at MAX_SHARED_ARGS and only grows.
  uint32_t nArgs;
};

extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;

// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
// The maximum number of workers in a kernel.
Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/option.h
Expand Up @@ -28,6 +28,10 @@
// region to synchronize with each other.
#define L1_BARRIER (1)

// Maximum number of preallocated arguments to an outlined parallel/simd function.
// Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
Expand Down
8 changes: 8 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Expand Up @@ -214,10 +214,16 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
//
// This routine is always called by the team master..
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
void ***SharedArgs, int32_t nArgs,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;

if (nArgs > 0) {
omptarget_nvptx_sharedArgs.EnsureSize(nArgs);
*SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
}

if (!IsOMPRuntimeInitialized)
return;

Expand Down Expand Up @@ -317,11 +323,13 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
void ***SharedArgs,
int16_t IsOMPRuntimeInitialized) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

// Work function and arguments for L1 parallel region.
*WorkFn = omptarget_nvptx_workFn;
*SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();

if (!IsOMPRuntimeInitialized)
return true;
Expand Down

0 comments on commit aaeab8d

Please sign in to comment.