Skip to content

Commit

Permalink
[OpenMP] Introduce kernel environment
Browse files Browse the repository at this point in the history
This patch introduces per kernel environment. Previously, flags such as execution mode are set through global variables with name like `__kernel_name_exec_mode`. They are accessible on the host by reading the corresponding global variable, but not from the device. Besides, some assumptions, such as no nested parallelism, are not per kernel basis, preventing us applying per kernel optimization in the device runtime.

This is a combination and refinement of patch series D116908, D116909, and D116910.

Depend on D155886.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D142569
  • Loading branch information
shiltian committed Jul 23, 2023
1 parent 14b466b commit c5c8040
Show file tree
Hide file tree
Showing 88 changed files with 8,846 additions and 3,492 deletions.
22 changes: 1 addition & 21 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Expand Up @@ -795,7 +795,7 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
emitGenericVarsEpilog(CGF);

CGBuilderTy &Bld = CGF.Builder;
OMPBuilder.createTargetDeinit(Bld, IsSPMD);
OMPBuilder.createTargetDeinit(Bld);
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
Expand Down Expand Up @@ -833,24 +833,6 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = false;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
bool Mode) {
auto *GVMode = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::WeakAnyLinkage,
llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
: OMP_TGT_EXEC_MODE_GENERIC),
Twine(Name, "_exec_mode"));
GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
CGM.addCompilerUsedGlobal(GVMode);
}

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Expand All @@ -867,8 +849,6 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
else
emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);

setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
Expand Down
8 changes: 4 additions & 4 deletions clang/test/OpenMP/amdgcn_target_codegen.cpp
Expand Up @@ -37,7 +37,7 @@ int test_amdgcn_target_tid_threads_simd() {
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l14_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
Expand All @@ -61,7 +61,7 @@ int test_amdgcn_target_tid_threads_simd() {
// CHECK: worker.exit:
// CHECK-NEXT: ret void
// CHECK: for.end:
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
//
//
Expand All @@ -78,7 +78,7 @@ int test_amdgcn_target_tid_threads_simd() {
// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
// CHECK-NEXT: store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2, i1 false)
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l23_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
Expand Down Expand Up @@ -109,6 +109,6 @@ int test_amdgcn_target_tid_threads_simd() {
// CHECK-NEXT: ret void
// CHECK: omp.inner.for.end:
// CHECK-NEXT: store i32 1000, ptr [[I_ASCAST]], align 4
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
//
18 changes: 9 additions & 9 deletions clang/test/OpenMP/amdgcn_target_device_vla.cpp
Expand Up @@ -111,7 +111,7 @@ int main() {
// CHECK-NEXT: [[I1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I1]] to ptr
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo1v_l12_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
Expand Down Expand Up @@ -169,7 +169,7 @@ int main() {
// CHECK-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP15:![0-9]+]]
// CHECK: for.end9:
// CHECK-NEXT: call void @__kmpc_free_shared(ptr [[A]], i64 [[TMP7]])
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
//
//
Expand All @@ -193,18 +193,18 @@ int main() {
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2, i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR5:[0-9]+]]
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
Expand Down Expand Up @@ -559,7 +559,7 @@ int main() {
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1, i1 true)
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
Expand All @@ -570,7 +570,7 @@ int main() {
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR5]]
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
Expand Down Expand Up @@ -918,7 +918,7 @@ int main() {
// CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RESULT_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1, i1 true)
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_kernel_environment to ptr))
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
Expand All @@ -932,7 +932,7 @@ int main() {
// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR5]]
// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
Expand Down
6 changes: 3 additions & 3 deletions clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c
Expand Up @@ -33,19 +33,19 @@ void write_to_aligned_array(int *a, int N) {
// CHECK-AMD-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
// CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
// CHECK-AMD-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_kernel_environment to ptr))
// CHECK-AMD-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-AMD-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-AMD: user_code.entry:
// CHECK-AMD-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// CHECK-AMD-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
// CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4
// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr [[N_CASTED_ASCAST]], align 4
// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8
// CHECK-AMD-NEXT: [[TMP4:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8
// CHECK-AMD-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
// CHECK-AMD-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]]
// CHECK-AMD-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
// CHECK-AMD-NEXT: call void @__kmpc_target_deinit()
// CHECK-AMD-NEXT: ret void
// CHECK-AMD: worker.exit:
// CHECK-AMD-NEXT: ret void
Expand Down
6 changes: 3 additions & 3 deletions clang/test/OpenMP/declare_target_codegen_globalization.cpp
Expand Up @@ -31,15 +31,15 @@ int maini1() {
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6maini1v_l16_kernel_environment)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8
// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6maini1v_l16_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
Expand Down

0 comments on commit c5c8040

Please sign in to comment.