Support ‘omp for’ with static chunked schedule kind.
Differential Revision: http://reviews.llvm.org/D7006

llvm-svn: 226795
amusman committed Jan 22, 2015
Commit df7a8e2 (parent: 3f68fae)
Showing 5 changed files with 168 additions and 3 deletions.
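For orientation, the construct this commit adds codegen support for looks like the new static_chunked() test below. A minimal stand-alone sketch (the function name here is illustrative; the loop is copied from that test):

```cpp
// Sketch of an 'omp for' worksharing loop with a static, chunked schedule.
// Loop bounds and body mirror the static_chunked() test added in this commit.
void static_chunked_example(float *a, float *b, float *c, float *d) {
  #pragma omp for schedule(static, 5)
  for (unsigned i = 131071; i <= 2147483647; i += 127)
    a[i] = b[i] * c[i] * d[i];
}
```

With schedule(static, 5), iterations are handed out in chunks of five, round-robin over the threads of the team, so codegen needs the outer dispatch loop added below instead of the single static init/fini pair used for the non-chunked case.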
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -848,6 +848,12 @@ bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
  return Schedule == OMP_sch_static;
}

bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
  auto Schedule = getRuntimeSchedule(ScheduleKind, /* Chunked */ false);
  assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here");
  return Schedule != OMP_sch_static;
}

void CGOpenMPRuntime::EmitOMPForInit(CodeGenFunction &CGF, SourceLocation Loc,
                                     OpenMPScheduleClauseKind ScheduleKind,
                                     unsigned IVSize, bool IVSigned,
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -320,6 +320,12 @@ class CGOpenMPRuntime {
  virtual bool isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                  bool Chunked) const;

  /// \brief Check if the specified \a ScheduleKind is dynamic.
  /// This kind of worksharing directive is emitted without outer loop.
  /// \param ScheduleKind Schedule Kind specified in the 'schedule' clause.
  ///
  virtual bool isDynamic(OpenMPScheduleClauseKind ScheduleKind) const;

  /// \brief Call the appropriate runtime routine to initialize it before start
  /// of loop.
  ///
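Taken together with isStaticNonchunked() above, the new predicate splits schedule kinds into three code paths in EmitOMPWorksharingLoop. A toy model of that split (illustrative only; the enum and helpers are invented for this example and are not clang's types):

```cpp
#include <cassert>

// Stand-in for a schedule kind plus chunk information.
enum class Sched { StaticNoChunk, StaticChunked, DynamicChunked };

// Mirrors the intent of CGOpenMPRuntime::isStaticNonchunked().
static bool isStaticNonchunked(Sched S) { return S == Sched::StaticNoChunk; }
// Mirrors the intent of CGOpenMPRuntime::isDynamic().
static bool isDynamic(Sched S) { return S == Sched::DynamicChunked; }

int main() {
  // static, no chunk: only the inner loop is emitted, no dispatch loop.
  assert(isStaticNonchunked(Sched::StaticNoChunk));
  // static, chunked: neither predicate holds, so the new outer dispatch
  // loop from this commit is emitted.
  assert(!isStaticNonchunked(Sched::StaticChunked) &&
         !isDynamic(Sched::StaticChunked));
  // dynamic: still rejected with "OpenMP loop with dynamic schedule".
  assert(isDynamic(Sched::DynamicChunked));
  return 0;
}
```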
92 changes: 90 additions & 2 deletions clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -500,6 +500,89 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
    DI->EmitLexicalBlockEnd(Builder, S.getSourceRange().getEnd());
}

void CodeGenFunction::EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
                                          const OMPLoopDirective &S,
                                          OMPPrivateScope &LoopScope,
                                          llvm::Value *LB, llvm::Value *UB,
                                          llvm::Value *ST, llvm::Value *IL,
                                          llvm::Value *Chunk) {
  auto &RT = CGM.getOpenMPRuntime();
  assert(!RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) &&
         "static non-chunked schedule does not need outer loop");
  if (RT.isDynamic(ScheduleKind)) {
    ErrorUnsupported(&S, "OpenMP loop with dynamic schedule");
    return;
  }

  // Emit outer loop.
  //
  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
  // When schedule(static, chunk_size) is specified, iterations are divided into
  // chunks of size chunk_size, and the chunks are assigned to the threads in
  // the team in a round-robin fashion in the order of the thread number.
  //
  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
  //   while (idx <= UB) { BODY; ++idx; } // inner loop
  //   LB = LB + ST;
  //   UB = UB + ST;
  // }
  //
  const Expr *IVExpr = S.getIterationVariable();
  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();

  RT.EmitOMPForInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, IL,
                    LB, UB, ST, Chunk);
  auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end");

  // Start the loop with a block that tests the condition.
  auto CondBlock = createBasicBlock("omp.dispatch.cond");
  EmitBlock(CondBlock);
  LoopStack.push(CondBlock);

  llvm::Value *BoolCondVal = nullptr;
  // UB = min(UB, GlobalUB)
  EmitIgnoredExpr(S.getEnsureUpperBound());
  // IV = LB
  EmitIgnoredExpr(S.getInit());
  // IV < UB
  BoolCondVal = EvaluateExprAsBool(S.getCond(false));

  // If there are any cleanups between here and the loop-exit scope,
  // create a block to stage a loop exit along.
  auto ExitBlock = LoopExit.getBlock();
  if (LoopScope.requiresCleanups())
    ExitBlock = createBasicBlock("omp.dispatch.cleanup");

  auto LoopBody = createBasicBlock("omp.dispatch.body");
  Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock);
  if (ExitBlock != LoopExit.getBlock()) {
    EmitBlock(ExitBlock);
    EmitBranchThroughCleanup(LoopExit);
  }
  EmitBlock(LoopBody);

  // Create a block for the increment.
  auto Continue = getJumpDestInCurrentScope("omp.dispatch.inc");
  BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));

  EmitOMPInnerLoop(S, LoopScope);

  EmitBlock(Continue.getBlock());
  BreakContinueStack.pop_back();
  // Emit "LB = LB + Stride", "UB = UB + Stride".
  EmitIgnoredExpr(S.getNextLowerBound());
  EmitIgnoredExpr(S.getNextUpperBound());

  EmitBranch(CondBlock);
  LoopStack.pop();
  // Emit the fall-through block.
  EmitBlock(LoopExit.getBlock());

  // Tell the runtime we are done.
  RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
}

/// \brief Emit a helper variable and return corresponding lvalue.
static LValue EmitOMPHelperVar(CodeGenFunction &CGF,
                               const DeclRefExpr *Helper) {
@@ -581,8 +664,13 @@ void CodeGenFunction::EmitOMPWorksharingLoop(const OMPLoopDirective &S) {
      EmitOMPInnerLoop(S, LoopScope);
      // Tell the runtime we are done.
      RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
-    } else
-      ErrorUnsupported(&S, "OpenMP loop with requested schedule");
+    } else {
+      // Emit the outer loop, which requests its work chunk [LB..UB] from
+      // runtime and runs the inner loop to process it.
+      EmitOMPForOuterLoop(ScheduleKind, S, LoopScope, LB.getAddress(),
+                          UB.getAddress(), ST.getAddress(), IL.getAddress(),
+                          Chunk);
+    }
  }
  // We're now done with the loop, so jump to the continuation block.
  EmitBranch(ContBlock);
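The outer/inner loop scheme that EmitOMPForOuterLoop emits (see the pseudo-code comment inside it) can be simulated outside the compiler. The sketch below models what the static-chunked init is expected to hand each thread, namely its own first chunk as [LB, UB] and a stride of nthreads * chunk, and then walks the chunks exactly like the omp.dispatch loop; this is an illustration under those assumptions, not libiomp5's implementation:

```cpp
#include <algorithm>
#include <cstdio>

// Per-thread bounds for schedule(static, chunk) over a 0..GlobalUB space.
struct Bounds { unsigned LB, UB, ST; };

static Bounds staticChunkedInit(unsigned Tid, unsigned NThreads,
                                unsigned Chunk) {
  Bounds B;
  B.LB = Tid * Chunk;       // this thread's first chunk
  B.UB = B.LB + Chunk - 1;  // inclusive end of that chunk
  B.ST = NThreads * Chunk;  // distance to this thread's next chunk
  return B;
}

int main() {
  const unsigned GlobalUB = 22, Chunk = 5, NThreads = 4; // iterations 0..22
  for (unsigned Tid = 0; Tid < NThreads; ++Tid) {
    Bounds B = staticChunkedInit(Tid, NThreads, Chunk);
    std::printf("thread %u:", Tid);
    while (true) {                            // omp.dispatch.cond
      unsigned UB = std::min(B.UB, GlobalUB); // UB = min(UB, GlobalUB)
      unsigned IV = B.LB;                     // IV = LB
      if (IV > UB)
        break;                                // omp.dispatch.end
      for (; IV <= UB; ++IV)                  // inner loop over the chunk
        std::printf(" %u", IV);
      B.LB += B.ST;                           // LB = LB + ST
      B.UB += B.ST;                           // UB = UB + ST
    }
    std::printf("\n");
  }
  return 0;
}
```

Running it prints each thread's iterations chunk by chunk (thread 0 gets 0-4 and then 20-22, thread 1 gets 5-9, and so on), matching the round-robin behaviour quoted from the OpenMP spec in the new comment.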
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.h
@@ -27,6 +27,7 @@
#include "clang/AST/Type.h"
#include "clang/Basic/ABI.h"
#include "clang/Basic/CapturedStmt.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Frontend/CodeGenOptions.h"
#include "llvm/ADT/ArrayRef.h"
@@ -2052,6 +2053,11 @@ class CodeGenFunction : public CodeGenTypeCache {
                        bool SeparateIter = false);
  void EmitOMPSimdFinal(const OMPLoopDirective &S);
  void EmitOMPWorksharingLoop(const OMPLoopDirective &S);
  void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
                           const OMPLoopDirective &S,
                           OMPPrivateScope &LoopScope, llvm::Value *LB,
                           llvm::Value *UB, llvm::Value *ST, llvm::Value *IL,
                           llvm::Value *Chunk);

public:

61 changes: 60 additions & 1 deletion clang/test/OpenMP/for_codegen.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
//
@@ -87,5 +87,64 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
// CHECK: ret void
}

// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
void static_chunked(float *a, float *b, float *c, float *d) {
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
  #pragma omp for schedule(static, 5)
// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
// UB = min(UB, GlobalUB)
// CHECK: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]]
// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]

// Outer loop header
// CHECK: [[O_IV:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[O_UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]

// Loop header
// CHECK: [[O_LOOP1_BODY]]
// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
  for (unsigned i = 131071; i <= 2147483647; i += 127) {
// CHECK: [[LOOP1_BODY]]
// Start of body: calculate i from IV:
// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
// ... loop body ...
// End of body: store into a[i]:
// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
    a[i] = b[i] * c[i] * d[i];
// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}
// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
// CHECK-NEXT: br label %{{.+}}
  }
// CHECK: [[LOOP1_END]]
// Update the counters, adding stride
// CHECK: [[LB:%.+]] = load i32* [[OMP_LB]]
// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]]
// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]]
// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]

// CHECK: [[O_LOOP1_END]]
// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
// CHECK: ret void
}

#endif // HEADER
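One non-obvious constant in the new CHECK lines is 16908288: it is the normalized global upper bound of the test loop, i.e. the index of the last iteration after clang rewrites the loop to run over 0..GlobalUB with i recomputed as 131071 + IV * 127. A quick stand-alone check of that arithmetic (illustrative, not part of the test):

```cpp
#include <cassert>
#include <cstdio>

int main() {
  const unsigned Start = 131071u;
  const unsigned End = 2147483647u; // inclusive bound of the test loop
  const unsigned Step = 127u;
  // Last normalized iteration index; the body maps IV back to
  // i = Start + IV * Step, as the CALC_I CHECK lines verify.
  const unsigned GlobalUB = (End - Start) / Step;
  assert(GlobalUB == 16908288u);
  std::printf("GlobalUB = %u, trip count = %u\n", GlobalUB, GlobalUB + 1u);
  return 0;
}
```

Since 2147483647 - 131071 is an exact multiple of 127, the final iteration lands exactly on i == 2147483647, which is why the UB comparison in the IR is against 16908288 with no remainder handling.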
