Support ‘omp for’ with static chunked schedule kind.
Differential Revision: http://reviews.llvm.org/D7006

llvm-svn: 226795
amusman committed Jan 22, 2015
Commit df7a8e2 (parent: 3f68fae)
Showing 5 changed files with 168 additions and 3 deletions.
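For orientation, the construct this commit adds codegen support for looks like the new static_chunked() test below. A minimal stand-alone sketch (the function name here is illustrative; the loop is copied from that test):

```cpp
// Sketch of an 'omp for' worksharing loop with a static, chunked schedule.
// Loop bounds and body mirror the static_chunked() test added in this commit.
void static_chunked_example(float *a, float *b, float *c, float *d) {
  #pragma omp for schedule(static, 5)
  for (unsigned i = 131071; i <= 2147483647; i += 127)
    a[i] = b[i] * c[i] * d[i];
}
```

With schedule(static, 5), iterations are handed out in chunks of five, round-robin over the threads of the team, so codegen needs the outer dispatch loop added below instead of the single static init/fini pair used for the non-chunked case.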
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -848,6 +848,12 @@ bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
  return Schedule == OMP_sch_static;
}

bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
  auto Schedule = getRuntimeSchedule(ScheduleKind, /* Chunked */ false);
  assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here");
  return Schedule != OMP_sch_static;
}

void CGOpenMPRuntime::EmitOMPForInit(CodeGenFunction &CGF, SourceLocation Loc,
                                     OpenMPScheduleClauseKind ScheduleKind,
                                     unsigned IVSize, bool IVSigned,
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -320,6 +320,12 @@ class CGOpenMPRuntime {
  virtual bool isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
                                  bool Chunked) const;

  /// \brief Check if the specified \a ScheduleKind is dynamic.
  /// This kind of worksharing directive is emitted without outer loop.
  /// \param ScheduleKind Schedule Kind specified in the 'schedule' clause.
  ///
  virtual bool isDynamic(OpenMPScheduleClauseKind ScheduleKind) const;

  /// \brief Call the appropriate runtime routine to initialize it before start
  /// of loop.
  ///
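Taken together with isStaticNonchunked() above, the new predicate splits schedule kinds into three code paths in EmitOMPWorksharingLoop. A toy model of that split (illustrative only; the enum and helpers are invented for this example and are not clang's types):

```cpp
#include <cassert>

// Stand-in for a schedule kind plus chunk information.
enum class Sched { StaticNoChunk, StaticChunked, DynamicChunked };

// Mirrors the intent of CGOpenMPRuntime::isStaticNonchunked().
static bool isStaticNonchunked(Sched S) { return S == Sched::StaticNoChunk; }
// Mirrors the intent of CGOpenMPRuntime::isDynamic().
static bool isDynamic(Sched S) { return S == Sched::DynamicChunked; }

int main() {
  // static, no chunk: only the inner loop is emitted, no dispatch loop.
  assert(isStaticNonchunked(Sched::StaticNoChunk));
  // static, chunked: neither predicate holds, so the new outer dispatch
  // loop from this commit is emitted.
  assert(!isStaticNonchunked(Sched::StaticChunked) &&
         !isDynamic(Sched::StaticChunked));
  // dynamic: still rejected with "OpenMP loop with dynamic schedule".
  assert(isDynamic(Sched::DynamicChunked));
  return 0;
}
```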
92 changes: 90 additions & 2 deletions clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -500,6 +500,89 @@ void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
    DI->EmitLexicalBlockEnd(Builder, S.getSourceRange().getEnd());
}

void CodeGenFunction::EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
                                          const OMPLoopDirective &S,
                                          OMPPrivateScope &LoopScope,
                                          llvm::Value *LB, llvm::Value *UB,
                                          llvm::Value *ST, llvm::Value *IL,
                                          llvm::Value *Chunk) {
  auto &RT = CGM.getOpenMPRuntime();
  assert(!RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) &&
         "static non-chunked schedule does not need outer loop");
  if (RT.isDynamic(ScheduleKind)) {
    ErrorUnsupported(&S, "OpenMP loop with dynamic schedule");
    return;
  }

  // Emit outer loop.
  //
  // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
  // When schedule(static, chunk_size) is specified, iterations are divided into
  // chunks of size chunk_size, and the chunks are assigned to the threads in
  // the team in a round-robin fashion in the order of the thread number.
  //
  // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
  //   while (idx <= UB) { BODY; ++idx; } // inner loop
  //   LB = LB + ST;
  //   UB = UB + ST;
  // }
  //
  const Expr *IVExpr = S.getIterationVariable();
  const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
  const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();

  RT.EmitOMPForInit(*this, S.getLocStart(), ScheduleKind, IVSize, IVSigned, IL,
                    LB, UB, ST, Chunk);
  auto LoopExit = getJumpDestInCurrentScope("omp.dispatch.end");

  // Start the loop with a block that tests the condition.
  auto CondBlock = createBasicBlock("omp.dispatch.cond");
  EmitBlock(CondBlock);
  LoopStack.push(CondBlock);

  llvm::Value *BoolCondVal = nullptr;
  // UB = min(UB, GlobalUB)
  EmitIgnoredExpr(S.getEnsureUpperBound());
  // IV = LB
  EmitIgnoredExpr(S.getInit());
  // IV < UB
  BoolCondVal = EvaluateExprAsBool(S.getCond(false));

  // If there are any cleanups between here and the loop-exit scope,
  // create a block to stage a loop exit along.
  auto ExitBlock = LoopExit.getBlock();
  if (LoopScope.requiresCleanups())
    ExitBlock = createBasicBlock("omp.dispatch.cleanup");

  auto LoopBody = createBasicBlock("omp.dispatch.body");
  Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock);
  if (ExitBlock != LoopExit.getBlock()) {
    EmitBlock(ExitBlock);
    EmitBranchThroughCleanup(LoopExit);
  }
  EmitBlock(LoopBody);

  // Create a block for the increment.
  auto Continue = getJumpDestInCurrentScope("omp.dispatch.inc");
  BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));

  EmitOMPInnerLoop(S, LoopScope);

  EmitBlock(Continue.getBlock());
  BreakContinueStack.pop_back();
  // Emit "LB = LB + Stride", "UB = UB + Stride".
  EmitIgnoredExpr(S.getNextLowerBound());
  EmitIgnoredExpr(S.getNextUpperBound());

  EmitBranch(CondBlock);
  LoopStack.pop();
  // Emit the fall-through block.
  EmitBlock(LoopExit.getBlock());

  // Tell the runtime we are done.
  RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
}

/// \brief Emit a helper variable and return corresponding lvalue.
static LValue EmitOMPHelperVar(CodeGenFunction &CGF,
                               const DeclRefExpr *Helper) {
@@ -581,8 +664,13 @@ void CodeGenFunction::EmitOMPWorksharingLoop(const OMPLoopDirective &S) {
      EmitOMPInnerLoop(S, LoopScope);
      // Tell the runtime we are done.
      RT.EmitOMPForFinish(*this, S.getLocStart(), ScheduleKind);
-    } else
-      ErrorUnsupported(&S, "OpenMP loop with requested schedule");
+    } else {
+      // Emit the outer loop, which requests its work chunk [LB..UB] from
+      // runtime and runs the inner loop to process it.
+      EmitOMPForOuterLoop(ScheduleKind, S, LoopScope, LB.getAddress(),
+                          UB.getAddress(), ST.getAddress(), IL.getAddress(),
+                          Chunk);
+    }
  }
  // We're now done with the loop, so jump to the continuation block.
  EmitBranch(ContBlock);
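The outer/inner loop scheme that EmitOMPForOuterLoop emits (see the pseudo-code comment inside it) can be simulated outside the compiler. The sketch below models what the static-chunked init is expected to hand each thread, namely its own first chunk as [LB, UB] and a stride of nthreads * chunk, and then walks the chunks exactly like the omp.dispatch loop; this is an illustration under those assumptions, not libiomp5's implementation:

```cpp
#include <algorithm>
#include <cstdio>

// Per-thread bounds for schedule(static, chunk) over a 0..GlobalUB space.
struct Bounds { unsigned LB, UB, ST; };

static Bounds staticChunkedInit(unsigned Tid, unsigned NThreads,
                                unsigned Chunk) {
  Bounds B;
  B.LB = Tid * Chunk;       // this thread's first chunk
  B.UB = B.LB + Chunk - 1;  // inclusive end of that chunk
  B.ST = NThreads * Chunk;  // distance to this thread's next chunk
  return B;
}

int main() {
  const unsigned GlobalUB = 22, Chunk = 5, NThreads = 4; // iterations 0..22
  for (unsigned Tid = 0; Tid < NThreads; ++Tid) {
    Bounds B = staticChunkedInit(Tid, NThreads, Chunk);
    std::printf("thread %u:", Tid);
    while (true) {                            // omp.dispatch.cond
      unsigned UB = std::min(B.UB, GlobalUB); // UB = min(UB, GlobalUB)
      unsigned IV = B.LB;                     // IV = LB
      if (IV > UB)
        break;                                // omp.dispatch.end
      for (; IV <= UB; ++IV)                  // inner loop over the chunk
        std::printf(" %u", IV);
      B.LB += B.ST;                           // LB = LB + ST
      B.UB += B.ST;                           // UB = UB + ST
    }
    std::printf("\n");
  }
  return 0;
}
```

Running it prints each thread's iterations chunk by chunk (thread 0 gets 0-4 and then 20-22, thread 1 gets 5-9, and so on), matching the round-robin behaviour quoted from the OpenMP spec in the new comment.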
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.h
@@ -27,6 +27,7 @@
#include "clang/AST/Type.h"
#include "clang/Basic/ABI.h"
#include "clang/Basic/CapturedStmt.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Frontend/CodeGenOptions.h"
#include "llvm/ADT/ArrayRef.h"
@@ -2052,6 +2053,11 @@ class CodeGenFunction : public CodeGenTypeCache {
                        bool SeparateIter = false);
  void EmitOMPSimdFinal(const OMPLoopDirective &S);
  void EmitOMPWorksharingLoop(const OMPLoopDirective &S);
  void EmitOMPForOuterLoop(OpenMPScheduleClauseKind ScheduleKind,
                           const OMPLoopDirective &S,
                           OMPPrivateScope &LoopScope, llvm::Value *LB,
                           llvm::Value *UB, llvm::Value *ST, llvm::Value *IL,
                           llvm::Value *Chunk);

public:

61 changes: 60 additions & 1 deletion clang/test/OpenMP/for_codegen.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
//
@@ -87,5 +87,64 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
// CHECK: ret void
}

// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
void static_chunked(float *a, float *b, float *c, float *d) {
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
  #pragma omp for schedule(static, 5)
// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
// UB = min(UB, GlobalUB)
// CHECK: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
// CHECK-NEXT: br i1 [[UBCMP]], label [[UB_TRUE:%[^,]+]], label [[UB_FALSE:%[^,]+]]
// CHECK: [[UBRESULT:%.+]] = phi i32 [ 16908288, [[UB_TRUE]] ], [ [[UBVAL:%[^,]+]], [[UB_FALSE]] ]
// CHECK-NEXT: store i32 [[UBRESULT]], i32* [[OMP_UB]]
// CHECK-NEXT: [[LB:%.+]] = load i32* [[OMP_LB]]
// CHECK-NEXT: store i32 [[LB]], i32* [[OMP_IV:[^,]+]]

// Outer loop header
// CHECK: [[O_IV:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[O_UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[O_CMP:%.+]] = icmp ule i32 [[O_IV]], [[O_UB]]
// CHECK-NEXT: br i1 [[O_CMP]], label %[[O_LOOP1_BODY:[^,]+]], label %[[O_LOOP1_END:[^,]+]]

// Loop header
// CHECK: [[O_LOOP1_BODY]]
// CHECK: [[IV:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[CMP:%.+]] = icmp ule i32 [[IV]], [[UB]]
// CHECK-NEXT: br i1 [[CMP]], label %[[LOOP1_BODY:[^,]+]], label %[[LOOP1_END:[^,]+]]
  for (unsigned i = 131071; i <= 2147483647; i += 127) {
// CHECK: [[LOOP1_BODY]]
// Start of body: calculate i from IV:
// CHECK: [[IV1_1:%.+]] = load i32* [[OMP_IV]]
// CHECK-NEXT: [[CALC_I_1:%.+]] = mul i32 [[IV1_1]], 127
// CHECK-NEXT: [[CALC_I_2:%.+]] = add i32 131071, [[CALC_I_1]]
// CHECK-NEXT: store i32 [[CALC_I_2]], i32* [[LC_I:.+]]
// ... loop body ...
// End of body: store into a[i]:
// CHECK: store float [[RESULT:%.+]], float* {{%.+}}
    a[i] = b[i] * c[i] * d[i];
// CHECK: [[IV1_2:%.+]] = load i32* [[OMP_IV]]{{.*}}
// CHECK-NEXT: [[ADD1_2:%.+]] = add i32 [[IV1_2]], 1
// CHECK-NEXT: store i32 [[ADD1_2]], i32* [[OMP_IV]]
// CHECK-NEXT: br label %{{.+}}
  }
// CHECK: [[LOOP1_END]]
// Update the counters, adding stride
// CHECK: [[LB:%.+]] = load i32* [[OMP_LB]]
// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]]
// CHECK-NEXT: [[ADD_LB:%.+]] = add i32 [[LB]], [[ST]]
// CHECK-NEXT: store i32 [[ADD_LB]], i32* [[OMP_LB]]
// CHECK-NEXT: [[UB:%.+]] = load i32* [[OMP_UB]]
// CHECK-NEXT: [[ST:%.+]] = load i32* [[OMP_ST]]
// CHECK-NEXT: [[ADD_UB:%.+]] = add i32 [[UB]], [[ST]]
// CHECK-NEXT: store i32 [[ADD_UB]], i32* [[OMP_UB]]

// CHECK: [[O_LOOP1_END]]
// CHECK: call void @__kmpc_for_static_fini([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
// CHECK: call {{.+}} @__kmpc_cancel_barrier([[IDENT_T_TY]]* [[DEFAULT_LOC_BARRIER:[@%].+]], i32 [[GTID]])
// CHECK: ret void
}

#endif // HEADER
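One non-obvious constant in the new CHECK lines is 16908288: it is the normalized global upper bound of the test loop, i.e. the index of the last iteration after clang rewrites the loop to run over 0..GlobalUB with i recomputed as 131071 + IV * 127. A quick stand-alone check of that arithmetic (illustrative, not part of the test):

```cpp
#include <cassert>
#include <cstdio>

int main() {
  const unsigned Start = 131071u;
  const unsigned End = 2147483647u; // inclusive bound of the test loop
  const unsigned Step = 127u;
  // Last normalized iteration index; the body maps IV back to
  // i = Start + IV * Step, as the CALC_I CHECK lines verify.
  const unsigned GlobalUB = (End - Start) / Step;
  assert(GlobalUB == 16908288u);
  std::printf("GlobalUB = %u, trip count = %u\n", GlobalUB, GlobalUB + 1u);
  return 0;
}
```

Since 2147483647 - 131071 is an exact multiple of 127, the final iteration lands exactly on i == 2147483647, which is why the UB comparison in the IR is against 16908288 with no remainder handling.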
