353 changes: 24 additions & 329 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Large diffs are not rendered by default.

38 changes: 6 additions & 32 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,19 +38,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
llvm::SmallVector<llvm::Function *, 16> Work;

struct EntryFunctionState {
llvm::BasicBlock *ExitBB = nullptr;
};

class WorkerFunctionState {
public:
llvm::Function *WorkerFn;
const CGFunctionInfo &CGFI;
SourceLocation Loc;

WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);

private:
void createWorkerFunction(CodeGenModule &CGM);
};

ExecutionMode getExecutionMode() const;
Expand All @@ -60,20 +48,13 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Get barrier to synchronize all threads in a block.
void syncCTAThreads(CodeGenFunction &CGF);

/// Emit the worker function for the current target region.
void emitWorkerFunction(WorkerFunctionState &WST);
/// Helper for target directive initialization.
void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST,
bool IsSPMD);

/// Helper for worker function. Emit body of worker loop.
void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);

/// Helper for non-SPMD target entry function. Guide the master and
/// worker threads to their respective locations.
void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
WorkerFunctionState &WST);

/// Signal termination of OMP execution for non-SPMD target entry
/// function.
void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
/// Helper for target directive finalization.
void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST,
bool IsSPMD);

/// Helper for generic variables globalization prolog.
void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc,
Expand All @@ -82,13 +63,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Helper for generic variables globalization epilog.
void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false);

/// Helper for SPMD mode target directive's entry function.
void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
const OMPExecutableDirective &D);

/// Signal termination of SPMD mode execution.
void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);

//
// Base class overrides.
//
Expand Down
7 changes: 2 additions & 5 deletions clang/test/OpenMP/amdgcn_target_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@ int test_amdgcn_target_tid_threads() {

int arr[N];

// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
// CHECK: sub nuw i32 [[NUM_THREADS]], 64
// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
// CHECK: call i32 @__kmpc_target_init(%struct.ident_t* addrspacecast (%struct.ident_t addrspace(1)* @1 to %struct.ident_t*), i1 false, i1 true, i1 true)
#pragma omp target
for (int i = 0; i < N; i++) {
arr[i] = 1;
Expand All @@ -29,8 +27,7 @@ int test_amdgcn_target_tid_threads_simd() {

int arr[N];

// CHECK: [[NUM_THREADS:%.+]] = call i32 @__kmpc_amdgcn_gpu_num_threads()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[NUM_THREADS]], i16 0)
// CHECK: call i32 @__kmpc_target_init(%struct.ident_t* addrspacecast (%struct.ident_t addrspace(1)* @1 to %struct.ident_t*), i1 true, i1 false, i1 false)
#pragma omp target simd
for (int i = 0; i < N; i++) {
arr[i] = 1;
Expand Down
34 changes: 7 additions & 27 deletions clang/test/OpenMP/assumes_include_nvptx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,39 +11,19 @@

// TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated.

// CHECK: define internal void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}_worker() [[attr0:#[0-9]*]]
// CHECK: define weak void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}() [[attr0]]
// CHECK: %call = call float @_Z3sinf(float 0.000000e+00) [[attr5:#[0-9]*]]
// CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() [[attr1:#[0-9]*]]
// CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() [[attr1]]
// CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]]
// CHECK: declare void @__kmpc_kernel_init(i32, i16)
// CHECK-NOT: #
// CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]]
// CHECK: declare void @__kmpc_kernel_deinit(i16)
// CHECK-NOT: #
// CHECK: declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) [[attr3:#[0-9]*]]
// CHECK: declare i1 @__kmpc_kernel_parallel(i8**)
// CHECK-NOT: #
// CHECK: declare i32 @__kmpc_global_thread_num(%struct.ident_t*) [[attr4:#[0-9]*]]
// CHECK: declare void @__kmpc_kernel_end_parallel()
// CHECK-NOT: #
// CHECK: define internal void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}_worker() [[attr0]]
// CHECK: define weak void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}() [[attr0:#[0-9]]]
// CHECK: call i32 @__kmpc_target_init(
// CHECK: declare float @_Z3sinf(float) [[attr1:#[0-9]*]]
// CHECK: declare void @__kmpc_target_deinit(
// CHECK: define weak void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}() [[attr0]]
// CHECK: %call = call double @_Z3sind(double 0.000000e+00) [[attr5]]
// CHECK: declare double @_Z3sind(double) [[attr2]]
// CHECK: %call = call double @_Z3sind(double 0.000000e+00) [[attr2:#[0-9]]]
// CHECK: declare double @_Z3sind(double) [[attr1]]

// CHECK: attributes [[attr0]]
// CHECK-NOT: "llvm.assume"
// CHECK: attributes [[attr1]]
// CHECK-NOT: "llvm.assume"
// CHECK: attributes [[attr2]]
// CHECK-SAME: "llvm.assume"="check_that_this_is_attached_to_included_functions_and_template_instantiations"
// CHECK: attributes [[attr3]]
// CHECK-NOT: "llvm.assume"
// CHECK: attributes [[attr4]]
// CHECK-NOT: "llvm.assume"
// CHECK: attributes [[attr5]]
// CHECK: attributes [[attr2]]
// CHECK-SAME: "llvm.assume"="check_that_this_is_attached_to_included_functions_and_template_instantiations"


Expand Down
38 changes: 18 additions & 20 deletions clang/test/OpenMP/declare_target_codegen_globalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,19 @@ int maini1() {
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP4]], i64 1)
// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK1: .omp.deinit:
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .exit:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true)
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
Expand All @@ -60,15 +58,15 @@ int maini1() {
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR3]]
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3fooRi
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK1-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
Expand All @@ -78,11 +76,11 @@ int maini1() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3barv
// CHECK1-SAME: () #[[ATTR2]] {
// CHECK1-SAME: () #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR3]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]])
// CHECK1-NEXT: ret i32 [[CALL]]
//
126 changes: 63 additions & 63 deletions clang/test/OpenMP/nvptx_SPMD_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,28 @@ int a;
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1

void foo() {
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams distribute parallel for simd if(a)
Expand All @@ -67,28 +67,28 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams distribute parallel for lastprivate(a)
Expand All @@ -112,25 +112,25 @@ int a;
#pragma omp target teams distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init(
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams
Expand Down Expand Up @@ -172,28 +172,28 @@ int a;
#pragma omp distribute parallel for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams
Expand Down Expand Up @@ -224,28 +224,28 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target
Expand Down Expand Up @@ -283,22 +283,22 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
#pragma omp target parallel for if(a)
for (int i = 0; i < 10; ++i)
Expand All @@ -321,28 +321,28 @@ int a;
#pragma omp target parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
#pragma omp target parallel if(a)
Expand Down Expand Up @@ -373,27 +373,27 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
#pragma omp target
Expand Down Expand Up @@ -431,22 +431,22 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK-DAG: [[FULL]]
#pragma omp target
#pragma omp parallel for
Expand Down
130 changes: 29 additions & 101 deletions clang/test/OpenMP/nvptx_data_sharing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -387,119 +387,47 @@ void test_ds(){
// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK: .await.work:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK: .select.workers:
// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK: .execute.parallel:
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK: .execute.fn:
// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK: .check.next:
// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
// CHECK: .execute.fn2:
// CHECK-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .check.next3:
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]])
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .terminate.parallel:
// CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK: .barrier.parallel:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14
// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK-NEXT: [[C:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8
// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK: .worker:
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker() #[[ATTR3]]
// CHECK-NEXT: br label [[DOTEXIT:%.*]]
// CHECK: .mastercheck:
// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK: .master:
// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [2 x i8*], align 8
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP8]], i64 1)
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP4]], i64 1)
// CHECK-NEXT: store i32 100, i32* [[B_ON_STACK]], align 4
// CHECK-NEXT: store i32 1000, i32* [[C]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 0
// CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1
// CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP13]], i64 2)
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS1]], i64 0, i64 0
// CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS1]], i64 0, i64 1
// CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8
// CHECK-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i64 2)
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[B]])
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]])
// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK: .termination.notifier:
// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTEXIT]]
// CHECK: .exit:
// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
Expand All @@ -513,7 +441,7 @@ void test_ds(){
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
Expand All @@ -527,12 +455,12 @@ void test_ds(){
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR1:[0-9]+]]
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
Expand All @@ -554,7 +482,7 @@ void test_ds(){
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
Expand All @@ -571,6 +499,6 @@ void test_ds(){
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR1]]
// CHECK-NEXT: ret void
//
114 changes: 54 additions & 60 deletions clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp

Large diffs are not rendered by default.

126 changes: 63 additions & 63 deletions clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1

void foo() {
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < 10; ++i)
;
Expand All @@ -40,13 +40,13 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target teams distribute parallel for lastprivate(a)
for (int i = 0; i < 10; ++i)
a = i;
Expand All @@ -68,13 +68,13 @@ int a;
#pragma omp target teams distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target teams
#pragma omp distribute parallel for simd
for (int i = 0; i < 10; ++i)
Expand Down Expand Up @@ -103,13 +103,13 @@ int a;
#pragma omp distribute parallel for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target teams
#pragma omp distribute parallel for
for (int i = 0; i < 10; ++i)
Expand Down Expand Up @@ -138,13 +138,13 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for
Expand Down Expand Up @@ -180,13 +180,13 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target parallel for
for (int i = 0; i < 10; ++i)
;
Expand All @@ -208,13 +208,13 @@ int a;
#pragma omp target parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target parallel
#pragma omp for simd
for (int i = 0; i < 10; ++i)
Expand Down Expand Up @@ -243,13 +243,13 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target
#pragma omp parallel
#pragma omp for simd ordered
Expand Down Expand Up @@ -285,13 +285,13 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
#pragma omp target
#pragma omp parallel for
for (int i = 0; i < 10; ++i)
Expand Down
1,176 changes: 405 additions & 771 deletions clang/test/OpenMP/nvptx_lambda_capturing.cpp

Large diffs are not rendered by default.

318 changes: 69 additions & 249 deletions clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp

Large diffs are not rendered by default.

363 changes: 84 additions & 279 deletions clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp

Large diffs are not rendered by default.

710 changes: 146 additions & 564 deletions clang/test/OpenMP/nvptx_parallel_codegen.cpp

Large diffs are not rendered by default.

117 changes: 26 additions & 91 deletions clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,50 +455,8 @@ int bar(int n){
// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK: .await.work:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK: .select.workers:
// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK: .execute.parallel:
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK: .execute.fn:
// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK: .check.next:
// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .terminate.parallel:
// CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK: .barrier.parallel:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13
// CHECK-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
Expand All @@ -507,59 +465,36 @@ int bar(int n){
// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK: .worker:
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker() #[[ATTR3]]
// CHECK-NEXT: br label [[DOTEXIT:%.*]]
// CHECK: .mastercheck:
// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK: .master:
// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32*
// CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK-NEXT: store i32 [[TMP7]], i32* [[D_ON_STACK]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
// CHECK-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8
// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[D_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
// CHECK-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP12]], i64 2)
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK-NEXT: store i32 [[TMP3]], i32* [[D_ON_STACK]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP5:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
// CHECK-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[D_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP8]], i64 2)
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3
// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[D]])
// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK: .termination.notifier:
// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTEXIT]]
// CHECK: .exit:
// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
Expand All @@ -584,7 +519,7 @@ int bar(int n){
// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
// CHECK: omp.dispatch.cond:
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
Expand Down Expand Up @@ -644,12 +579,12 @@ int bar(int n){
// CHECK-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[OMP_DISPATCH_COND]]
// CHECK: omp.dispatch.end:
// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]])
// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
Expand All @@ -666,6 +601,6 @@ int bar(int n){
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR1:[0-9]+]]
// CHECK-NEXT: ret void
//
2,150 changes: 463 additions & 1,687 deletions clang/test/OpenMP/nvptx_target_codegen.cpp

Large diffs are not rendered by default.

6 changes: 0 additions & 6 deletions clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,3 @@ int bar(int n, double *ptr) {
// TCHECK: ret void

#endif

// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
// TCHECK-DAG: distinct !DISubprogram(linkageName: "__omp_offloading_{{.+}}_worker",
408 changes: 192 additions & 216 deletions clang/test/OpenMP/nvptx_target_parallel_codegen.cpp

Large diffs are not rendered by default.

420 changes: 198 additions & 222 deletions clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp

Large diffs are not rendered by default.

49 changes: 0 additions & 49 deletions clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,54 +53,5 @@ int bar(int n){
return a;
}

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
// CHECK-NOT: call void @__kmpc_push_proc_bind
// CHECK: {{call|invoke}} void [[OP1:@.+]](
// CHECK: br label {{%?}}[[DONE:.+]]
//
// CHECK: [[DONE]]
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: }

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
// CHECK-NOT: call void @__kmpc_push_proc_bind
// CHECK: {{call|invoke}} void [[OP1:@.+]](
// CHECK: br label {{%?}}[[DONE:.+]]
//
// CHECK: [[DONE]]
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: }

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
// CHECK-NOT: call void @__kmpc_push_proc_bind
// CHECK: {{call|invoke}} void [[OP1:@.+]](
// CHECK: br label {{%?}}[[DONE:.+]]
//
// CHECK: [[DONE]]
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: }
#endif
24 changes: 6 additions & 18 deletions clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,8 @@ int bar(int n){

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN]](
Expand Down Expand Up @@ -237,12 +233,8 @@ int bar(int n){

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN1]](
Expand Down Expand Up @@ -498,12 +490,8 @@ int bar(int n){

// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
// CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true)
//
//
// define internal void [[PFN2]](
Expand Down
1,260 changes: 435 additions & 825 deletions clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp

Large diffs are not rendered by default.

211 changes: 137 additions & 74 deletions clang/test/OpenMP/nvptx_target_printf_codegen.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
Expand All @@ -6,77 +7,24 @@
// expected-no-diagnostics
extern int printf(const char *, ...);

// CHECK-DAG: private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds

// Check a simple call to printf end-to-end.
// CHECK-DAG: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double }
// CHECK-NOT: private unnamed_addr constant %struct.ident_t { i32 0, i32 2, {{1|2|3}}
int CheckSimple() {
// CHECK: define {{.*}}void [[T1:@__omp_offloading_.+CheckSimple.+]]_worker()
#pragma omp target
{
// Entry point.
// CHECK: define {{.*}}void [[T1]]()
// Alloca in entry block.
// CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]]

// CHECK: {{call|invoke}} void [[T1]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]

// printf in master-only basic block.
// CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
const char* fmt = "%d %lld %f";
// CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0
// CHECK: store i32 1, i32* [[PTR0]], align 4
// CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1
// CHECK: store i64 2, i64* [[PTR1]], align 8
// CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2

// CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8
// CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8*
// CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]])
printf(fmt, 1, 2ll, 3.0);
}

return 0;
}

void CheckNoArgs() {
// CHECK: define {{.*}}void [[T2:@__omp_offloading_.+CheckNoArgs.+]]_worker()
#pragma omp target
{
// Entry point.
// CHECK: define {{.*}}void [[T2]]()

// CHECK: {{call|invoke}} void [[T2]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]

// printf in master-only basic block.
// CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
printf("hello, world!");
}
}
Expand All @@ -85,31 +33,146 @@ void CheckNoArgs() {
// statement.
int foo;
void CheckAllocaIsInEntryBlock() {
// CHECK: define {{.*}}void [[T3:@__omp_offloading_.+CheckAllocaIsInEntryBlock.+]]_worker()
#pragma omp target
{
// Entry point.
// CHECK: define {{.*}}void [[T3]](
// Alloca in entry block.
// CHECK: alloca %printf_args

// CHECK: {{call|invoke}} void [[T3]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]

if (foo) {
printf("%d", 42);
}
}
}
//
//
//
// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
// CHECK-64-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-64-NEXT: entry:
// CHECK-64-NEXT: [[FMT:%.*]] = alloca i8*, align 8
// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-64: user_code.entry:
// CHECK-64-NEXT: store i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i8** [[FMT]], align 8
// CHECK-64-NEXT: [[TMP1:%.*]] = load i8*, i8** [[FMT]], align 8
// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0
// CHECK-64-NEXT: store i32 1, i32* [[TMP2]], align 4
// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1
// CHECK-64-NEXT: store i64 2, i64* [[TMP3]], align 8
// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2
// CHECK-64-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8
// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8*
// CHECK-64-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]])
// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-64-NEXT: ret void
// CHECK-64: worker.exit:
// CHECK-64-NEXT: ret void
//
//
// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
// CHECK-64-SAME: () #[[ATTR0]] {
// CHECK-64-NEXT: entry:
// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-64: user_code.entry:
// CHECK-64-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i8* null)
// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-64-NEXT: ret void
// CHECK-64: worker.exit:
// CHECK-64-NEXT: ret void
//
//
// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
// CHECK-64-SAME: (i64 [[FOO:%.*]]) #[[ATTR0]] {
// CHECK-64-NEXT: entry:
// CHECK-64-NEXT: [[FOO_ADDR:%.*]] = alloca i64, align 8
// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
// CHECK-64-NEXT: store i64 [[FOO]], i64* [[FOO_ADDR]], align 8
// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[FOO_ADDR]] to i32*
// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-64: user_code.entry:
// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK-64-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-64-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
// CHECK-64: if.then:
// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0
// CHECK-64-NEXT: store i32 42, i32* [[TMP2]], align 4
// CHECK-64-NEXT: [[TMP3:%.*]] = bitcast %printf_args.0* [[TMP]] to i8*
// CHECK-64-NEXT: [[TMP4:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str2, i64 0, i64 0), i8* [[TMP3]])
// CHECK-64-NEXT: br label [[IF_END]]
// CHECK-64: worker.exit:
// CHECK-64-NEXT: ret void
// CHECK-64: if.end:
// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-64-NEXT: ret void
//
//
//
//
//
// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
// CHECK-32-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-32-NEXT: entry:
// CHECK-32-NEXT: [[FMT:%.*]] = alloca i8*, align 4
// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true)
// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-32: user_code.entry:
// CHECK-32-NEXT: store i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i8** [[FMT]], align 4
// CHECK-32-NEXT: [[TMP1:%.*]] = load i8*, i8** [[FMT]], align 4
// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 0
// CHECK-32-NEXT: store i32 1, i32* [[TMP2]], align 4
// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 1
// CHECK-32-NEXT: store i64 2, i64* [[TMP3]], align 8
// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], %printf_args* [[TMP]], i32 0, i32 2
// CHECK-32-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8
// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8*
// CHECK-32-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]])
// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-32-NEXT: ret void
// CHECK-32: worker.exit:
// CHECK-32-NEXT: ret void
//
//
// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
// CHECK-32-SAME: () #[[ATTR0]] {
// CHECK-32-NEXT: entry:
// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-32: user_code.entry:
// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i32 0, i32 0), i8* null)
// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-32-NEXT: ret void
// CHECK-32: worker.exit:
// CHECK-32-NEXT: ret void
//
//
// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
// CHECK-32-SAME: (i32 [[FOO:%.*]]) #[[ATTR0]] {
// CHECK-32-NEXT: entry:
// CHECK-32-NEXT: [[FOO_ADDR:%.*]] = alloca i32, align 4
// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
// CHECK-32-NEXT: store i32 [[FOO]], i32* [[FOO_ADDR]], align 4
// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true)
// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK-32: user_code.entry:
// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[FOO_ADDR]], align 4
// CHECK-32-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-32-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
// CHECK-32: if.then:
// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], %printf_args.0* [[TMP]], i32 0, i32 0
// CHECK-32-NEXT: store i32 42, i32* [[TMP2]], align 4
// CHECK-32-NEXT: [[TMP3:%.*]] = bitcast %printf_args.0* [[TMP]] to i8*
// CHECK-32-NEXT: [[TMP4:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str2, i32 0, i32 0), i8* [[TMP3]])
// CHECK-32-NEXT: br label [[IF_END]]
// CHECK-32: worker.exit:
// CHECK-32-NEXT: ret void
// CHECK-32: if.end:
// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
// CHECK-32-NEXT: ret void
//
16 changes: 8 additions & 8 deletions clang/test/OpenMP/nvptx_target_simd_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,32 +61,32 @@ int bar(int n){
}

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)
// CHECK: ret void

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)
// CHECK: ret void

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)
// CHECK: ret void

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)
// CHECK: ret void

#endif
648 changes: 147 additions & 501 deletions clang/test/OpenMP/nvptx_target_teams_codegen.cpp

Large diffs are not rendered by default.

303 changes: 54 additions & 249 deletions clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp

Large diffs are not rendered by default.

920 changes: 436 additions & 484 deletions clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,33 +70,33 @@ int bar(int n){
}

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)

// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)

// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void

// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)

// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void

// CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false)
// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false)

// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
Expand Down
736 changes: 136 additions & 600 deletions clang/test/OpenMP/nvptx_teams_codegen.cpp

Large diffs are not rendered by default.

832 changes: 239 additions & 593 deletions clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out

// host-no-diagnostics
// Will be renabled with D101977
// XFAIL: *

void bar1(void) {
#pragma omp parallel // #0
Expand Down
2 changes: 2 additions & 0 deletions clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out

// host-no-diagnostics
// Will be renabled with D101977
// XFAIL: *

void bar(void) {
#pragma omp parallel // #1 \
Expand Down
900 changes: 447 additions & 453 deletions clang/test/OpenMP/target_parallel_debug_codegen.cpp

Large diffs are not rendered by default.

1,240 changes: 617 additions & 623 deletions clang/test/OpenMP/target_parallel_for_debug_codegen.cpp

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,29 @@ class OpenMPIRBuilder {
llvm::ConstantInt *Size,
const llvm::Twine &Name = Twine(""));

/// The `omp target` interface
///
/// For more information about the usage of this interface,
/// \see openmp/libomptarget/deviceRTLs/common/include/target.h
///
///{

/// Create a runtime call for kmpc_target_init
///
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);

/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);

///}

/// Declarations for LLVM-IR types (simple, array, function and structure) are
/// generated below. Their names are defined and used in OpenMPKinds.def. Here
/// we provide the declarations, the initializeTypes function will provide the
Expand Down
6 changes: 2 additions & 4 deletions llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
Original file line number Diff line number Diff line change
Expand Up @@ -409,10 +409,8 @@ __OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,
/* Int */ Int32, /* kmp_task_t */ VoidPtr)

/// OpenMP Device runtime functions
__OMP_RTL(__kmpc_kernel_init, false, Void, Int32, Int16)
__OMP_RTL(__kmpc_kernel_deinit, false, Void, Int16)
__OMP_RTL(__kmpc_spmd_kernel_init, false, Void, Int32, Int16)
__OMP_RTL(__kmpc_spmd_kernel_deinit_v2, false, Void, Int16)
__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1)
__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
Expand Down
65 changes: 65 additions & 0 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
Expand Down Expand Up @@ -2191,6 +2192,70 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
return Builder.CreateCall(Fn, Args);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime) {
if (!updateToLocation(Loc))
return Loc.IP;

Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
Value *Ident = getOrCreateIdent(SrcLocStr);
ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
ConstantInt *UseGenericStateMachine =
ConstantInt::getBool(Int32->getContext(), !IsSPMD);
ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);

Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_init);

CallInst *ThreadKind =
Builder.CreateCall(Fn, {Ident, IsSPMDVal, UseGenericStateMachine, RequiresFullRuntimeVal});

Value *ExecUserCode = Builder.CreateICmpEQ(
ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), "exec_user_code");

// ThreadKind = __kmpc_target_init(...)
// if (ThreadKind == -1)
// user_code
// else
// return;

auto *UI = Builder.CreateUnreachable();
BasicBlock *CheckBB = UI->getParent();
BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");

BasicBlock *WorkerExitBB = BasicBlock::Create(
CheckBB->getContext(), "worker.exit", CheckBB->getParent());
Builder.SetInsertPoint(WorkerExitBB);
Builder.CreateRetVoid();

auto *CheckBBTI = CheckBB->getTerminator();
Builder.SetInsertPoint(CheckBBTI);
Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);

CheckBBTI->eraseFromParent();
UI->eraseFromParent();

// Continue in the "user_code" block, see diagram above and in
// openmp/libomptarget/deviceRTLs/common/include/target.h .
return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
}

void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
bool IsSPMD, bool RequiresFullRuntime) {
if (!updateToLocation(Loc))
return;

Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
Value *Ident = getOrCreateIdent(SrcLocStr);
ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD);
ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime);

Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

Builder.CreateCall(Fn, {Ident, IsSPMDVal, RequiresFullRuntimeVal});
}

std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
StringRef FirstSeparator,
StringRef Separator) {
Expand Down
16 changes: 14 additions & 2 deletions llvm/lib/Transforms/IPO/Attributor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,8 +626,20 @@ void IRPosition::verify() {
Optional<Constant *>
Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
bool &UsedAssumedInformation) {
const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
AA, IRPosition::value(V, AA.getCallBaseContext()), DepClassTy::NONE);
// First check all callbacks provided by outside AAs. If any of them returns
// a non-null value that is different from the associated value, or None, we
// assume it's simpliied.
IRPosition IRP = IRPosition::value(V, AA.getCallBaseContext());
for (auto &CB : SimplificationCallbacks[IRP]) {
Optional<Value *> SimplifiedV = CB(IRP, &AA, UsedAssumedInformation);
if (!SimplifiedV.hasValue())
return llvm::None;
if (*SimplifiedV && *SimplifiedV != &IRP.getAssociatedValue() &&
isa<Constant>(*SimplifiedV))
return cast<Constant>(*SimplifiedV);
}
const auto &ValueSimplifyAA =
getAAFor<AAValueSimplify>(AA, IRP, DepClassTy::NONE);
Optional<Value *> SimplifiedV =
ValueSimplifyAA.getAssumedSimplifiedValue(*this);
bool IsKnown = ValueSimplifyAA.isAtFixpoint();
Expand Down
Loading