diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index f5fc7a8ce5bb3..ac1b3cdfff145 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -356,6 +356,13 @@ void getOpenMPCaptureRegions( /// \return true - if the above condition is met for this directive /// otherwise - false. bool isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind); + +/// Checks if the specified target directive, combined or not, needs task based +/// thread_limit +/// \param DKind Specified directive. +/// \return true - if the above condition is met for this directive +/// otherwise - false. +bool needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind); } #endif diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index a679f2ecf0e2b..86de067da134a 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -748,6 +748,13 @@ bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) { DKind == OMPD_parallel_sections; } +bool clang::needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind) { + return DKind == OMPD_target || DKind == OMPD_target_parallel || + DKind == OMPD_target_parallel_for || + DKind == OMPD_target_parallel_for_simd || DKind == OMPD_target_simd || + DKind == OMPD_target_parallel_loop; +} + void clang::getOpenMPCaptureRegions( SmallVectorImpl &CaptureRegions, OpenMPDirectiveKind DKind) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5d947a2c0943a..253ef8b75163e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9681,9 +9681,13 @@ void CGOpenMPRuntime::emitTargetCall( assert((OffloadingMandatory || OutlinedFn) && "Invalid outlined function!"); - const bool RequiresOuterTask = D.hasClausesOfKind() || - D.hasClausesOfKind() || - D.hasClausesOfKind(); + const bool RequiresOuterTask = + D.hasClausesOfKind() || + D.hasClausesOfKind() || + D.hasClausesOfKind() || + (CGM.getLangOpts().OpenMP >= 51 && + needsTaskBasedThreadLimit(D.getDirectiveKind()) && + D.hasClausesOfKind()); llvm::SmallVector CapturedVars; const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target); auto &&ArgsCodegen = [&CS, &CapturedVars](CodeGenFunction &CGF, @@ -10235,6 +10239,24 @@ void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF, PushNumTeamsArgs); } +void CGOpenMPRuntime::emitThreadLimitClause(CodeGenFunction &CGF, + const Expr *ThreadLimit, + SourceLocation Loc) { + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *ThreadLimitVal = + ThreadLimit + ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit), + CGF.CGM.Int32Ty, /* isSigned = */ true) + : CGF.Builder.getInt32(0); + + // Build call __kmpc_set_thread_limit(&loc, global_tid, thread_limit) + llvm::Value *ThreadLimitArgs[] = {RTLoc, getThreadID(CGF, Loc), + ThreadLimitVal}; + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_set_thread_limit), + ThreadLimitArgs); +} + void CGOpenMPRuntime::emitTargetDataCalls( CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond, const Expr *Device, const RegionCodeGenTy &CodeGen, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index d1ad6d7f06a85..74b528d6cd7f8 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1435,6 +1435,14 @@ class CGOpenMPRuntime { virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc); + /// Emits call to void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 + /// global_tid, kmp_int32 thread_limit) to generate code for + /// thread_limit clause on target directive + /// \param ThreadLimit An integer expression of threads. + virtual void emitThreadLimitClause(CodeGenFunction &CGF, + const Expr *ThreadLimit, + SourceLocation Loc); + /// Struct that keeps all the relevant information that should be kept /// throughout a 'target data' region. class TargetDataInfo : public llvm::OpenMPIRBuilder::TargetDataInfo { diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 4910ff6865e43..6eca0a5ccab41 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -5143,6 +5143,15 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective( Action.Enter(CGF); OMPLexicalScope LexScope(CGF, S, OMPD_task, /*EmitPreInitStmt=*/false); + auto *TL = S.getSingleClause(); + if (CGF.CGM.getLangOpts().OpenMP >= 51 && + needsTaskBasedThreadLimit(S.getDirectiveKind()) && TL) { + // Emit __kmpc_set_thread_limit() to set the thread_limit for the task + // enclosing this target region. This will indirectly set the thread_limit + // for every applicable construct within target region. + CGF.CGM.getOpenMPRuntime().emitThreadLimitClause( + CGF, TL->getThreadLimit(), S.getBeginLoc()); + } BodyGen(CGF); }; llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction( diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 04aac12efe8bf..46eae3596d2a8 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -15907,6 +15907,11 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_target_teams_distribute_parallel_for: case OMPD_target_teams_distribute_parallel_for_simd: case OMPD_target_teams_loop: + case OMPD_target_simd: + case OMPD_target_parallel: + case OMPD_target_parallel_for: + case OMPD_target_parallel_for_simd: + case OMPD_target_parallel_loop: CaptureRegion = OMPD_target; break; case OMPD_teams_distribute_parallel_for: @@ -15942,11 +15947,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_parallel_for: case OMPD_parallel_for_simd: case OMPD_parallel_loop: - case OMPD_target_simd: - case OMPD_target_parallel: - case OMPD_target_parallel_for: - case OMPD_target_parallel_for_simd: - case OMPD_target_parallel_loop: case OMPD_threadprivate: case OMPD_allocate: case OMPD_taskyield: diff --git a/clang/test/OpenMP/target_codegen.cpp b/clang/test/OpenMP/target_codegen.cpp index bf56b25af11ed..bd3d7eb853dab 100644 --- a/clang/test/OpenMP/target_codegen.cpp +++ b/clang/test/OpenMP/target_codegen.cpp @@ -846,7 +846,8 @@ void thread_limit_target(int TargetTL, int TeamsTL) { // OMP51: store {{.*}} [[TL]], {{.*}} [[CEA:%.*]] // OMP51: load {{.*}} [[CEA]] // OMP51: [[CE:%.*]] = load {{.*}} [[CEA]] -// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 -1, i32 [[CE]], +// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}}) +// OMP51: call i32 [[OMP_TASK_ENTRY]] #pragma omp target thread_limit(TargetTL) #pragma omp teams @@ -854,8 +855,8 @@ void thread_limit_target(int TargetTL, int TeamsTL) { // OMP51: [[TL:%.*]] = load {{.*}} %TargetTL.addr // OMP51: store {{.*}} [[TL]], {{.*}} [[CEA:%.*]] // OMP51: load {{.*}} [[CEA]] -// OMP51: [[CE:%.*]] = load {{.*}} [[CEA]] -// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, i32 [[CE]], +// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}}) +// OMP51: call i32 [[OMP_TASK_ENTRY]] #pragma omp target #pragma omp teams thread_limit(TeamsTL) @@ -869,10 +870,25 @@ void thread_limit_target(int TargetTL, int TeamsTL) { {} // OMP51: load {{.*}} %TeamsTL.addr // OMP51: [[TeamsL:%.*]] = load {{.*}} %TeamsTL.addr -// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, i32 [[TeamsL]], +// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}}) +// OMP51: call i32 [[OMP_TASK_ENTRY]] } #endif +// Check that the offloading functions are called after setting thread_limit in the task entry functions + +// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1) +// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}}) +// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 -1, + +// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1) +// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}}) +// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, + +// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1) +// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}}) +// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, + // CHECK: define internal void @.omp_offloading.requires_reg() // CHECK: call void @__tgt_register_requires(i64 1) diff --git a/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp new file mode 100644 index 0000000000000..daeb5102b0e22 --- /dev/null +++ b/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +int thread_limit_target_parallel_for_simd() { + +// Check that the offloading function is called after setting thread_limit in the task entry function +#pragma omp target parallel for simd thread_limit(2) + for(int i=0; i<2; i++) {} + + return 0; +} + +#endif +// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_for_simd{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]]) +// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.) +// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 +// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]] +// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: ret i32 0 +// +// +// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]]) +// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !18 +// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !18 +// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !18 +// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2) +// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_for_simd{{.*\(.*\).*}} +// OMP51-NEXT: ret i32 0 +// diff --git a/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp new file mode 100644 index 0000000000000..e6483b704586e --- /dev/null +++ b/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +int thread_limit_target_parallel_for() { + +// Check that the offloading function is called after setting thread_limit in the task entry function +#pragma omp target parallel for thread_limit(2) + for(int i=0; i<2; i++) {} + + return 0; +} + +#endif +// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_for{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]]) +// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.) +// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 +// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]] +// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: ret i32 0 +// +// +// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2) +// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_for{{.*}} +// OMP51-NEXT: ret i32 0 +// diff --git a/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp new file mode 100644 index 0000000000000..32bbb546a05a3 --- /dev/null +++ b/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +int thread_limit_target_parallel_loop() { + +// Check that the offloading function is called after setting thread_limit in the task entry function +#pragma omp target parallel loop thread_limit(2) + for(int i=0; i<2; i++) {} + + return 0; +} + +#endif +// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_loop{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]]) +// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.) +// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 +// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]] +// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: ret i32 0 +// +// +// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2) +// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_loop{{.*}} +// OMP51-NEXT: ret i32 0 +// diff --git a/clang/test/OpenMP/target_parallel_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_tl_codegen.cpp new file mode 100644 index 0000000000000..e1ca288bd7329 --- /dev/null +++ b/clang/test/OpenMP/target_parallel_tl_codegen.cpp @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +int thread_limit_target_parallel() { + +// Check that the offloading function is called after setting thread_limit in the task entry function +#pragma omp target parallel thread_limit(2) +{} + + return 0; +} + +#endif +// OMP51-LABEL: define {{.*}}thread_limit_target_parallel{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.) +// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 +// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]] +// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: ret i32 0 +// +// +// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14 +// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14 +// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB1]], i32 [[TMP9]], i32 2) +// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel{{.*}} +// OMP51-NEXT: ret i32 0 +// diff --git a/clang/test/OpenMP/target_simd_tl_codegen.cpp b/clang/test/OpenMP/target_simd_tl_codegen.cpp new file mode 100644 index 0000000000000..8d6139d055fc5 --- /dev/null +++ b/clang/test/OpenMP/target_simd_tl_codegen.cpp @@ -0,0 +1,66 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51 + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +int thread_limit_target_simd() { + +// Check that the offloading function is called after setting thread_limit in the task entry function +#pragma omp target simd thread_limit(2) + for(int i=0; i<2; i++) {} + + return 0; +} + +#endif +// OMP51-LABEL: define {{.*}}thread_limit_target_simd{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1 +// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) +// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.) +// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 +// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]] +// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// OMP51-NEXT: ret i32 0 +// +// +// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{ +// OMP51-NEXT: entry: +// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) +// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]]) +// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !16 +// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !16 +// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !16 +// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB1]], i32 [[TMP9]], i32 2) +// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_simd{{.*}} +// OMP51-NEXT: ret i32 0 +// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 84ed836ff236c..b6639b67a5c52 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -762,6 +762,7 @@ def OMP_TargetParallel : Directive<"target parallel"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; } def OMP_TargetParallelFor : Directive<"target parallel for"> { @@ -793,6 +794,7 @@ def OMP_TargetParallelFor : Directive<"target parallel for"> { ]; let allowedOnceClauses = [ VersionedClause, + VersionedClause, ]; } def OMP_TargetParallelDo : Directive<"target parallel do"> { @@ -1279,6 +1281,7 @@ def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> { ]; let allowedOnceClauses = [ VersionedClause, + VersionedClause, ]; } def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> { @@ -1342,7 +1345,8 @@ def OMP_TargetSimd : Directive<"target simd"> { VersionedClause, VersionedClause, VersionedClause, - VersionedClause + VersionedClause, + VersionedClause, ]; } def OMP_TeamsDistribute : Directive<"teams distribute"> { @@ -2160,6 +2164,7 @@ def OMP_target_parallel_loop : Directive<"target parallel loop"> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; } def OMP_Metadirective : Directive<"metadirective"> { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index aa85b3fa7f209..c4218326280b2 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -386,6 +386,7 @@ __OMP_RTL(__kmpc_cancellationpoint, false, Int32, IdentPtr, Int32, Int32) __OMP_RTL(__kmpc_fork_teams, true, Void, IdentPtr, Int32, ParallelTaskPtr) __OMP_RTL(__kmpc_push_num_teams, false, Void, IdentPtr, Int32, Int32, Int32) +__OMP_RTL(__kmpc_set_thread_limit, false, Void, IdentPtr, Int32, Int32) __OMP_RTL(__kmpc_copyprivate, false, Void, IdentPtr, Int32, SizeTy, VoidPtr, CopyFunctionPtr, Int32) @@ -913,6 +914,8 @@ __OMP_RTL_ATTRS(__kmpc_fork_teams, ForkAttrs, AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt, ReadOnlyPtrAttrs)) __OMP_RTL_ATTRS(__kmpc_push_num_teams, InaccessibleArgOnlyAttrs, AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt, SExt)) +__OMP_RTL_ATTRS(__kmpc_set_thread_limit, InaccessibleArgOnlyAttrs, AttributeSet(), + ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt)) __OMP_RTL_ATTRS(__kmpc_copyprivate, DefaultAttrs, AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs, SExt, SizeTyExt, diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index a65f34ff3b86d..33895f8fbb1e3 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2111,6 +2111,7 @@ typedef struct kmp_internal_control { int nproc; /* internal control for #threads for next parallel region (per thread) */ int thread_limit; /* internal control for thread-limit-var */ + int task_thread_limit; /* internal control for thread-limit-var of a task*/ int max_active_levels; /* internal control for max_active_levels */ kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ @@ -3340,6 +3341,7 @@ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */ extern int __kmp_max_nth; // maximum total number of concurrently-existing threads in a contention group extern int __kmp_cg_max_nth; +extern int __kmp_task_max_nth; // max threads used in a task extern int __kmp_teams_max_nth; // max threads used in a teams construct extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */ @@ -4297,6 +4299,8 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads); +KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid, + kmp_int32 thread_limit); /* Function for OpenMP 5.1 num_teams clause */ KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams_lb, diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 8283691501669..8bd0e89a7dacd 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -381,6 +381,24 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, __kmp_push_num_teams(loc, global_tid, num_teams, num_threads); } +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number +@param thread_limit limit on number of threads which can be created within the +current task + +Set the thread_limit for the current task +This call is there to support `thread_limit` clause on the `target` construct +*/ +void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid, + kmp_int32 thread_limit) { + __kmp_assert_valid_gtid(global_tid); + kmp_info_t *thread = __kmp_threads[global_tid]; + if (thread_limit > 0) + thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit; +} + /*! @ingroup PARALLEL @param loc source location information diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index d686a889972ec..ffb01a31fb93e 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -807,6 +807,10 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) { gtid = __kmp_entry_gtid(); thread = __kmp_threads[gtid]; + // If thread_limit for the target task is defined, return that instead of the + // regular task thread_limit + if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit) + return thread_limit; return thread->th.th_current_task->td_icvs.thread_limit; #endif } diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index c66ab59a01c6f..48097fb530d1c 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -125,6 +125,7 @@ size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE; int __kmp_sys_max_nth = KMP_MAX_NTH; int __kmp_max_nth = 0; int __kmp_cg_max_nth = 0; +int __kmp_task_max_nth = 0; int __kmp_teams_max_nth = 0; int __kmp_threads_capacity = 0; int __kmp_dflt_team_nth = 0; diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 34f6781e91010..c8a18e81810cb 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -1872,6 +1872,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, int nthreads; int master_active; int master_set_numthreads; + int task_thread_limit = 0; int level; int active_level; int teams_level; @@ -1910,6 +1911,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, root = master_th->th.th_root; master_active = root->r.r_active; master_set_numthreads = master_th->th.th_set_nproc; + task_thread_limit = + master_th->th.th_current_task->td_icvs.task_thread_limit; #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; @@ -2000,6 +2003,11 @@ int __kmp_fork_call(ident_t *loc, int gtid, ? master_set_numthreads // TODO: get nproc directly from current task : get__nproc_2(parent_team, master_tid); + // Use the thread_limit set for the current target task if exists, else go + // with the deduced nthreads + nthreads = task_thread_limit > 0 && task_thread_limit < nthreads + ? task_thread_limit + : nthreads; // Check if we need to take forkjoin lock? (no need for serialized // parallel out of teams construct). if (nthreads > 1) { @@ -3291,6 +3299,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) { // next parallel region (per thread) // (use a max ub on value if __kmp_parallel_initialize not called yet) __kmp_cg_max_nth, // int thread_limit; + __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit + // on task. This is used in the case of target thread_limit __kmp_dflt_max_active_levels, // int max_active_levels; //internal control // for max_active_levels r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule diff --git a/openmp/runtime/test/target/target_thread_limit.cpp b/openmp/runtime/test/target/target_thread_limit.cpp new file mode 100644 index 0000000000000..0cc3307977e97 --- /dev/null +++ b/openmp/runtime/test/target/target_thread_limit.cpp @@ -0,0 +1,168 @@ +// RUN: %libomp-cxx-compile -fopenmp-version=51 +// RUN: %libomp-run | FileCheck %s --check-prefix OMP51 + +#include +#include + +void foo() { +#pragma omp parallel num_threads(10) + { printf("\ntarget: foo(): parallel num_threads(10)"); } +} + +int main(void) { + + int tl = 4; + printf("\nmain: thread_limit = %d", omp_get_thread_limit()); + // OMP51: main: thread_limit = {{[0-9]+}} + +#pragma omp target thread_limit(tl) + { + printf("\ntarget: thread_limit = %d", omp_get_thread_limit()); +// OMP51: target: thread_limit = 4 +// check whether thread_limit is honoured +#pragma omp parallel + { printf("\ntarget: parallel"); } +// OMP51: target: parallel +// OMP51: target: parallel +// OMP51: target: parallel +// OMP51: target: parallel +// OMP51-NOT: target: parallel + +// check whether num_threads is honoured +#pragma omp parallel num_threads(2) + { printf("\ntarget: parallel num_threads(2)"); } +// OMP51: target: parallel num_threads(2) +// OMP51: target: parallel num_threads(2) +// OMP51-NOT: target: parallel num_threads(2) + +// check whether thread_limit is honoured when there is a conflicting +// num_threads +#pragma omp parallel num_threads(10) + { printf("\ntarget: parallel num_threads(10)"); } + // OMP51: target: parallel num_threads(10) + // OMP51: target: parallel num_threads(10) + // OMP51: target: parallel num_threads(10) + // OMP51: target: parallel num_threads(10) + // OMP51-NOT: target: parallel num_threads(10) + + // check whether threads are limited across functions + foo(); + // OMP51: target: foo(): parallel num_threads(10) + // OMP51: target: foo(): parallel num_threads(10) + // OMP51: target: foo(): parallel num_threads(10) + // OMP51: target: foo(): parallel num_threads(10) + // OMP51-NOT: target: foo(): parallel num_threads(10) + + // check if user can set num_threads at runtime + omp_set_num_threads(2); +#pragma omp parallel + { printf("\ntarget: parallel with omp_set_num_thread(2)"); } + // OMP51: target: parallel with omp_set_num_thread(2) + // OMP51: target: parallel with omp_set_num_thread(2) + // OMP51-NOT: target: parallel with omp_set_num_thread(2) + + // make sure thread_limit is unaffected by omp_set_num_threads + printf("\ntarget: thread_limit = %d", omp_get_thread_limit()); + // OMP51: target: thread_limit = 4 + } + +// checking consecutive target regions with different thread_limits +#pragma omp target thread_limit(3) + { + printf("\nsecond target: thread_limit = %d", omp_get_thread_limit()); +// OMP51: second target: thread_limit = 3 +#pragma omp parallel + { printf("\nsecond target: parallel"); } + // OMP51: second target: parallel + // OMP51: second target: parallel + // OMP51: second target: parallel + // OMP51-NOT: second target: parallel + } + + // confirm that thread_limit's effects are limited to target region + printf("\nmain: thread_limit = %d", omp_get_thread_limit()); + // OMP51: main: thread_limit = {{[0-9]+}} +#pragma omp parallel num_threads(10) + { printf("\nmain: parallel num_threads(10)"); } + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51: main: parallel num_threads(10) + // OMP51-NOT: main: parallel num_threads(10) + +// check combined target directives which support thread_limit +// target parallel +#pragma omp target parallel thread_limit(2) + printf("\ntarget parallel thread_limit(2)"); + // OMP51: target parallel thread_limit(2) + // OMP51: target parallel thread_limit(2) + // OMP51-NOT: target parallel thread_limit(2) + +#pragma omp target parallel num_threads(2) thread_limit(3) + printf("\ntarget parallel num_threads(2) thread_limit(3)"); + // OMP51: target parallel num_threads(2) thread_limit(3) + // OMP51: target parallel num_threads(2) thread_limit(3) + // OMP51-NOT: target parallel num_threads(2) thread_limit(3) + +#pragma omp target parallel num_threads(3) thread_limit(2) + printf("\ntarget parallel num_threads(3) thread_limit(2)"); + // OMP51: target parallel num_threads(3) thread_limit(2) + // OMP51: target parallel num_threads(3) thread_limit(2) + // OMP51-NOT: target parallel num_threads(3) thread_limit(2) + +// target parallel for +#pragma omp target parallel for thread_limit(2) + for (int i = 0; i < 5; ++i) + printf("\ntarget parallel for thread_limit(2) : thread num = %d", + omp_get_thread_num()); + // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}} + // OMP51-NOT: target parallel for thread_limit(3) : thread num = {{0|1}} + +// target parallel for simd +#pragma omp target parallel for simd thread_limit(2) + for (int i = 0; i < 5; ++i) + printf("\ntarget parallel for simd thread_limit(2) : thread num = %d", + omp_get_thread_num()); + // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}} + // OMP51-NOT: target parallel for simd thread_limit(2) : thread num = + // {{0|1}} + +// target simd +#pragma omp target simd thread_limit(2) + for (int i = 0; i < 5; ++i) + printf("\ntarget simd thread_limit(2) : thread num = %d", + omp_get_thread_num()); + // OMP51: target simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target simd thread_limit(2) : thread num = {{0|1}} + // OMP51: target simd thread_limit(2) : thread num = {{0|1}} + // OMP51-NOT: target simd thread_limit(2) : thread num = {{0|1}} + +// target parallel loop +#pragma omp target parallel loop thread_limit(2) + for (int i = 0; i < 5; ++i) + printf("\ntarget parallel loop thread_limit(2) : thread num = %d", + omp_get_thread_num()); + // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}} + // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}} + // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}} + // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}} + // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}} + // # OMP51-NOT: target parallel loop thread_limit(2) : thread num = {{0|1}} + return 0; +}