86 changes: 83 additions & 3 deletions clang/lib/Sema/SemaOpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4478,6 +4478,8 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
Params);
break;
}
// For 'target teams loop', collect all captured regions so codegen can
// later decide the best IR to emit given the associated loop-nest.
case OMPD_target_teams_loop:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd: {
Expand Down Expand Up @@ -6135,6 +6137,79 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack,
}
}

namespace {
/// A 'teams loop' with a nested 'loop bind(parallel)' or generic function
/// call in the associated loop-nest cannot be a 'parallel for'.
class TeamsLoopChecker final : public ConstStmtVisitor<TeamsLoopChecker> {
Sema &SemaRef;

public:
bool teamsLoopCanBeParallelFor() const { return TeamsLoopCanBeParallelFor; }

// Is there a nested OpenMP loop bind(parallel)
void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
if (D->getDirectiveKind() == llvm::omp::Directive::OMPD_loop) {
if (const auto *C = D->getSingleClause<OMPBindClause>())
if (C->getBindKind() == OMPC_BIND_parallel) {
TeamsLoopCanBeParallelFor = false;
// No need to continue visiting any more
return;
}
}
for (const Stmt *Child : D->children())
if (Child)
Visit(Child);
}

void VisitCallExpr(const CallExpr *C) {
// Function calls inhibit parallel loop translation of 'target teams loop'
// unless the assume-no-nested-parallelism flag has been specified.
// OpenMP API runtime library calls do not inhibit parallel loop
// translation, regardless of the assume-no-nested-parallelism.
if (C) {
bool IsOpenMPAPI = false;
auto *FD = dyn_cast_or_null<FunctionDecl>(C->getCalleeDecl());
if (FD) {
std::string Name = FD->getNameInfo().getAsString();
IsOpenMPAPI = Name.find("omp_") == 0;
}
TeamsLoopCanBeParallelFor =
IsOpenMPAPI || SemaRef.getLangOpts().OpenMPNoNestedParallelism;
if (!TeamsLoopCanBeParallelFor)
return;
}
for (const Stmt *Child : C->children())
if (Child)
Visit(Child);
}

void VisitCapturedStmt(const CapturedStmt *S) {
if (!S)
return;
Visit(S->getCapturedDecl()->getBody());
}

void VisitStmt(const Stmt *S) {
if (!S)
return;
for (const Stmt *Child : S->children())
if (Child)
Visit(Child);
}
explicit TeamsLoopChecker(Sema &SemaRef)
: SemaRef(SemaRef), TeamsLoopCanBeParallelFor(true) {}

private:
bool TeamsLoopCanBeParallelFor;
};
} // namespace

bool Sema::teamsLoopCanBeParallelFor(Stmt *AStmt) {
TeamsLoopChecker Checker(*this);
Checker.Visit(AStmt);
return Checker.teamsLoopCanBeParallelFor();
}

bool Sema::mapLoopConstruct(llvm::SmallVector<OMPClause *> &ClausesWithoutBind,
ArrayRef<OMPClause *> Clauses,
OpenMPBindClauseKind &BindKind,
Expand Down Expand Up @@ -10895,7 +10970,8 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective(
setFunctionHasBranchProtectedScope();

return OMPTargetTeamsGenericLoopDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
teamsLoopCanBeParallelFor(AStmt));
}

StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective(
Expand Down Expand Up @@ -15645,14 +15721,19 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
CaptureRegion = OMPD_target;
break;
case OMPD_teams_loop:
case OMPD_target_teams_loop:
// For [target] teams loop, assume capture region is 'teams' so it's
// available for codegen later to use if/when necessary.
CaptureRegion = OMPD_teams;
break;
case OMPD_target_teams_distribute_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
break;
}
[[fallthrough]];
case OMPD_target_teams_loop:
case OMPD_target_teams_distribute_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'teams' region, otherwise do not capture.
Expand Down Expand Up @@ -15775,7 +15856,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_loop:
case OMPD_teams_loop:
case OMPD_teams:
case OMPD_tile:
case OMPD_unroll:
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Serialization/ASTReaderStmt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2776,6 +2776,7 @@ void ASTStmtReader::VisitOMPTeamsGenericLoopDirective(
void ASTStmtReader::VisitOMPTargetTeamsGenericLoopDirective(
OMPTargetTeamsGenericLoopDirective *D) {
VisitOMPLoopDirective(D);
D->setCanBeParallelFor(Record.readBool());
}

void ASTStmtReader::VisitOMPParallelGenericLoopDirective(
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Serialization/ASTWriterStmt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2823,6 +2823,7 @@ void ASTStmtWriter::VisitOMPTeamsGenericLoopDirective(
void ASTStmtWriter::VisitOMPTargetTeamsGenericLoopDirective(
OMPTargetTeamsGenericLoopDirective *D) {
VisitOMPLoopDirective(D);
Record.writeBool(D->canBeParallelFor());
Code = serialization::STMT_OMP_TARGET_TEAMS_GENERIC_LOOP_DIRECTIVE;
}

Expand Down
48 changes: 39 additions & 9 deletions clang/test/OpenMP/nvptx_target_teams_generic_loop_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,35 +320,44 @@ int bar(int n){
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_kernel_environment, ptr [[DYN_PTR]])
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP3]] to i1
// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR2]]
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
Expand All @@ -360,6 +369,7 @@ int bar(int n){
// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
Expand Down Expand Up @@ -1566,35 +1576,44 @@ int bar(int n){
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
// CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
// CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
// CHECK2-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_kernel_environment, ptr [[DYN_PTR]])
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
// CHECK2-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP3]] to i1
// CHECK2-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
// CHECK2-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR2]]
// CHECK2-NEXT: call void @__kmpc_target_deinit()
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
Expand All @@ -1606,6 +1625,7 @@ int bar(int n){
// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
// CHECK2-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
// CHECK2-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
Expand Down Expand Up @@ -2801,35 +2821,44 @@ int bar(int n){
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
// CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_kernel_environment, ptr [[DYN_PTR]])
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
// CHECK3-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP3]] to i1
// CHECK3-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
// CHECK3-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR2]]
// CHECK3-NEXT: call void @__kmpc_target_deinit()
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
Expand All @@ -2841,6 +2870,7 @@ int bar(int n){
// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
// CHECK3-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
Expand Down

Large diffs are not rendered by default.

1,132 changes: 8 additions & 1,124 deletions clang/test/OpenMP/target_teams_generic_loop_codegen.cpp

Large diffs are not rendered by default.

587 changes: 587 additions & 0 deletions clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp

Large diffs are not rendered by default.

3,998 changes: 3,998 additions & 0 deletions clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp

Large diffs are not rendered by default.

710 changes: 114 additions & 596 deletions clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,6 @@ void gtid_test() {
// CHECK1: omp.inner.for.end:
// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
// CHECK1: omp.loop.exit:
// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP3]])
// CHECK1-NEXT: ret void
//
1,438 changes: 240 additions & 1,198 deletions clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp

Large diffs are not rendered by default.

1,360 changes: 148 additions & 1,212 deletions clang/test/OpenMP/teams_generic_loop_codegen-1.cpp

Large diffs are not rendered by default.

630 changes: 138 additions & 492 deletions clang/test/OpenMP/teams_generic_loop_codegen.cpp

Large diffs are not rendered by default.

808 changes: 136 additions & 672 deletions clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp

Large diffs are not rendered by default.

685 changes: 103 additions & 582 deletions clang/test/OpenMP/teams_generic_loop_private_codegen.cpp

Large diffs are not rendered by default.

785 changes: 109 additions & 676 deletions clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp

Large diffs are not rendered by default.