3 changes: 2 additions & 1 deletion polly/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ add_library(PollyCore OBJECT
CodeGen/BlockGenerators.cpp
${ISL_CODEGEN_FILES}
CodeGen/LoopGenerators.cpp
CodeGen/LoopGeneratorsGOMP.cpp
CodeGen/LoopGeneratorsKMP.cpp
CodeGen/IRBuilder.cpp
CodeGen/Utils.cpp
CodeGen/RuntimeDebugBuilder.cpp
Expand Down Expand Up @@ -158,4 +160,3 @@ if (TARGET intrinsics_gen)
# Check if we are building as part of an LLVM build
add_dependencies(PollyCore intrinsics_gen)
endif()

29 changes: 25 additions & 4 deletions polly/lib/CodeGen/IslNodeBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslExprBuilder.h"
#include "polly/CodeGen/LoopGenerators.h"
#include "polly/CodeGen/LoopGeneratorsGOMP.h"
#include "polly/CodeGen/LoopGeneratorsKMP.h"
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "polly/Config/config.h"
#include "polly/Options.h"
Expand Down Expand Up @@ -80,6 +81,9 @@ STATISTIC(ParallelLoops, "Number of generated parallel for-loops");
STATISTIC(VectorLoops, "Number of generated vector for-loops");
STATISTIC(IfConditions, "Number of generated if-conditions");

/// OpenMP runtime libraries Polly can generate parallel-loop calls for.
/// GNU selects ParallelLoopGeneratorGOMP and LLVM selects
/// ParallelLoopGeneratorKMP when a parallel loop is created; the value is
/// chosen with the -polly-omp-backend option.
enum class OpenMPBackend { GNU, LLVM };

static cl::opt<bool> PollyGenerateRTCPrint(
"polly-codegen-emit-rtc-print",
cl::desc("Emit code that prints the runtime check result dynamically."),
Expand All @@ -99,6 +103,12 @@ static cl::opt<int> PollyTargetFirstLevelCacheLineSize(
cl::desc("The size of the first level cache line size specified in bytes."),
cl::Hidden, cl::init(64), cl::ZeroOrMore, cl::cat(PollyCategory));

// Command-line option -polly-omp-backend: picks the OpenMP runtime library
// that generated parallel loops call into. Accepts the spellings "GNU" and
// "LLVM" and is initialized to OpenMPBackend::GNU.
static cl::opt<OpenMPBackend> PollyOmpBackend(
"polly-omp-backend", cl::desc("Choose the OpenMP library to use:"),
cl::values(clEnumValN(OpenMPBackend::GNU, "GNU", "GNU OpenMP"),
clEnumValN(OpenMPBackend::LLVM, "LLVM", "LLVM OpenMP")),
cl::Hidden, cl::init(OpenMPBackend::GNU), cl::cat(PollyCategory));

isl::ast_expr IslNodeBuilder::getUpperBound(isl::ast_node For,
ICmpInst::Predicate &Predicate) {
isl::ast_expr Cond = For.for_get_cond();
Expand Down Expand Up @@ -668,10 +678,21 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) {
}

ValueMapT NewValues;
ParallelLoopGenerator ParallelLoopGen(Builder, LI, DT, DL);

IV = ParallelLoopGen.createParallelLoop(ValueLB, ValueUB, ValueInc,
SubtreeValues, NewValues, &LoopBody);
std::unique_ptr<ParallelLoopGenerator> ParallelLoopGenPtr;

switch (PollyOmpBackend) {
case OpenMPBackend::GNU:
ParallelLoopGenPtr.reset(
new ParallelLoopGeneratorGOMP(Builder, LI, DT, DL));
break;
case OpenMPBackend::LLVM:
ParallelLoopGenPtr.reset(new ParallelLoopGeneratorKMP(Builder, LI, DT, DL));
break;
}

IV = ParallelLoopGenPtr->createParallelLoop(
ValueLB, ValueUB, ValueInc, SubtreeValues, NewValues, &LoopBody);
BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
Builder.SetInsertPoint(&*LoopBody);

Expand Down
208 changes: 40 additions & 168 deletions polly/lib/CodeGen/LoopGenerators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create scalar and parallel loops as LLVM-IR.
// This file contains functions to create scalar loops and orchestrate the
// creation of parallel loops as LLVM-IR.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/LoopGenerators.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
Expand All @@ -22,10 +24,36 @@
using namespace llvm;
using namespace polly;

// Command-line option -polly-num-threads: the thread count that is passed to
// the OpenMP runtime when a parallel loop is started. Initialized to 0, which
// the option description documents as "auto".
static cl::opt<int>
PollyNumThreads("polly-num-threads",
cl::desc("Number of threads to use (0 = auto)"), cl::Hidden,
cl::init(0));
// Definitions of the option values that live in namespace polly. Each one is
// the external storage (cl::location) of the matching command-line option
// defined below.
int polly::PollyNumThreads;
OMPGeneralSchedulingType polly::PollyScheduling;
int polly::PollyChunkSize;

// Command-line option -polly-num-threads: number of threads to use. The parsed
// value is written to polly::PollyNumThreads. Initialized to 0, which the
// option description documents as "auto".
static cl::opt<int, true>
XPollyNumThreads("polly-num-threads",
cl::desc("Number of threads to use (0 = auto)"),
cl::Hidden, cl::location(polly::PollyNumThreads),
cl::init(0), cl::cat(PollyCategory));

// Command-line option -polly-scheduling: scheduling type requested for
// parallel OpenMP loops. The parsed value is written to
// polly::PollyScheduling. Accepted spellings are "static" (mapped to
// OMPGeneralSchedulingType::StaticChunked), "dynamic", "guided" and "runtime";
// the option is initialized to OMPGeneralSchedulingType::Runtime.
static cl::opt<OMPGeneralSchedulingType, true> XPollyScheduling(
"polly-scheduling",
cl::desc("Scheduling type of parallel OpenMP for loops"),
cl::values(clEnumValN(OMPGeneralSchedulingType::StaticChunked, "static",
"Static scheduling"),
clEnumValN(OMPGeneralSchedulingType::Dynamic, "dynamic",
"Dynamic scheduling"),
clEnumValN(OMPGeneralSchedulingType::Guided, "guided",
"Guided scheduling"),
clEnumValN(OMPGeneralSchedulingType::Runtime, "runtime",
"Runtime determined (OMP_SCHEDULE)")),
cl::Hidden, cl::location(polly::PollyScheduling),
cl::init(OMPGeneralSchedulingType::Runtime), cl::Optional,
cl::cat(PollyCategory));

// Command-line option -polly-scheduling-chunksize: chunk size to use in the
// OpenMP runtime calls. The parsed value is written to polly::PollyChunkSize
// and is initialized to 0.
static cl::opt<int, true>
XPollyChunkSize("polly-scheduling-chunksize",
cl::desc("Chunksize to use by the OpenMP runtime calls"),
cl::Hidden, cl::location(polly::PollyChunkSize),
cl::init(0), cl::Optional, cl::cat(PollyCategory));

// We generate a loop of either of the following structures:
//
Expand Down Expand Up @@ -147,11 +175,13 @@ Value *polly::createLoop(Value *LB, Value *UB, Value *Stride,
Value *ParallelLoopGenerator::createParallelLoop(
Value *LB, Value *UB, Value *Stride, SetVector<Value *> &UsedValues,
ValueMapT &Map, BasicBlock::iterator *LoopBody) {
Function *SubFn;

AllocaInst *Struct = storeValuesIntoStruct(UsedValues);
BasicBlock::iterator BeforeLoop = Builder.GetInsertPoint();
Value *IV = createSubFn(Stride, Struct, UsedValues, Map, &SubFn);

Value *IV;
Function *SubFn;
std::tie(IV, SubFn) = createSubFn(Stride, Struct, UsedValues, Map);
*LoopBody = Builder.GetInsertPoint();
Builder.SetInsertPoint(&*BeforeLoop);

Expand All @@ -162,102 +192,15 @@ Value *ParallelLoopGenerator::createParallelLoop(
// whereas the codegenForSequential function creates a <= comparison.
UB = Builder.CreateAdd(UB, ConstantInt::get(LongType, 1));

// Tell the runtime we start a parallel loop
createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
Builder.CreateCall(SubFn, SubFnParam);
createCallJoinThreads();
// Execute the prepared subfunction in parallel.
deployParallelExecution(SubFn, SubFnParam, LB, UB, Stride);

return IV;
}

// Emit a call to GOMP_parallel_loop_runtime_start at the current insert point.
// The function is looked up in the module and declared with external linkage
// if no such function exists yet. The call receives, in this order: SubFn,
// SubFnParam, the value of PollyNumThreads as an i32, LB, UB and Stride.
void ParallelLoopGenerator::createCallSpawnThreads(Value *SubFn,
                                                   Value *SubFnParam, Value *LB,
                                                   Value *UB, Value *Stride) {
  const std::string RuntimeFnName = "GOMP_parallel_loop_runtime_start";
  Function *RuntimeFn = M->getFunction(RuntimeFnName);

  if (!RuntimeFn) {
    // Declared as:
    //   void (void (*)(i8 *), i8 *, i32, LongType, LongType, LongType)
    FunctionType *SubFnTy = FunctionType::get(Builder.getVoidTy(),
                                              Builder.getInt8PtrTy(), false);
    Type *ParamTys[] = {PointerType::getUnqual(SubFnTy),
                        Builder.getInt8PtrTy(),
                        Builder.getInt32Ty(),
                        LongType,
                        LongType,
                        LongType};

    RuntimeFn = Function::Create(
        FunctionType::get(Builder.getVoidTy(), ParamTys, false),
        Function::ExternalLinkage, RuntimeFnName, M);
  }

  Value *CallArgs[] = {
      SubFn, SubFnParam, Builder.getInt32(PollyNumThreads), LB, UB, Stride};
  Builder.CreateCall(RuntimeFn, CallArgs);
}

// Emit a call to GOMP_loop_runtime_next(LBPtr, UBPtr), declaring the function
// with external linkage if the module does not contain it yet. Returns the
// result of comparing the call's i8 return value against zero (not-equal).
Value *ParallelLoopGenerator::createCallGetWorkItem(Value *LBPtr,
                                                    Value *UBPtr) {
  const std::string RuntimeFnName = "GOMP_loop_runtime_next";
  Function *RuntimeFn = M->getFunction(RuntimeFnName);

  if (!RuntimeFn) {
    // Declared as: i8 (LongType *, LongType *)
    Type *ParamTys[] = {LongType->getPointerTo(), LongType->getPointerTo()};
    RuntimeFn = Function::Create(
        FunctionType::get(Builder.getInt8Ty(), ParamTys, false),
        Function::ExternalLinkage, RuntimeFnName, M);
  }

  Value *CallArgs[] = {LBPtr, UBPtr};
  Value *RawResult = Builder.CreateCall(RuntimeFn, CallArgs);

  // Widen 'false' to the call's result type and test the result against it.
  Value *Zero = Builder.CreateZExt(Builder.getFalse(), RawResult->getType());
  return Builder.CreateICmpNE(RawResult, Zero);
}

void ParallelLoopGenerator::createCallJoinThreads() {
const std::string Name = "GOMP_parallel_end";

Function *F = M->getFunction(Name);

// If F is not available, declare it.
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;

FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
F = Function::Create(Ty, Linkage, Name, M);
}

Builder.CreateCall(F, {});
}

void ParallelLoopGenerator::createCallCleanupThread() {
const std::string Name = "GOMP_loop_end_nowait";

Function *F = M->getFunction(Name);

// If F is not available, declare it.
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;

FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
F = Function::Create(Ty, Linkage, Name, M);
}

Builder.CreateCall(F, {});
}

Function *ParallelLoopGenerator::createSubFnDefinition() {
Function *F = Builder.GetInsertBlock()->getParent();
std::vector<Type *> Arguments(1, Builder.getInt8PtrTy());
FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false);
Function *SubFn = Function::Create(FT, Function::InternalLinkage,
F->getName() + "_polly_subfn", M);
Function *SubFn = prepareSubFnDefinition(F);

// Certain backends (e.g., NVPTX) do not support '.'s in function names.
// Hence, we ensure that all '.'s are replaced by '_'s.
Expand All @@ -268,9 +211,6 @@ Function *ParallelLoopGenerator::createSubFnDefinition() {
// Do not run any polly pass on the new function.
SubFn->addFnAttr(PollySkipFnAttr);

Function::arg_iterator AI = SubFn->arg_begin();
AI->setName("polly.par.userContext");

return SubFn;
}

Expand Down Expand Up @@ -310,71 +250,3 @@ void ParallelLoopGenerator::extractValuesFromStruct(
Map[OldValues[i]] = NewValue;
}
}

// Create the subfunction that executes the loop in parallel and fill in its
// control-flow skeleton: a setup block, a block that asks the runtime for the
// next set of iterations, a block that loads the bounds of that set and runs
// the loop, and an exit block.
//
// Stride      Step of the generated loop.
// StructData  Alloca of the struct holding the values used inside the loop.
// Data        The values to extract from that struct in the subfunction.
// Map         Receives the mapping from the old values to the extracted ones.
// SubFnPtr    Out-parameter; set to the created subfunction.
//
// Returns the induction variable produced by createLoop. On return the
// builder is positioned at the insert point that was current right after
// createLoop returned.
Value *ParallelLoopGenerator::createSubFn(Value *Stride, AllocaInst *StructData,
                                          SetVector<Value *> Data,
                                          ValueMapT &Map, Function **SubFnPtr) {
  Function *Worker = createSubFnDefinition();
  LLVMContext &Ctx = Worker->getContext();

  // Block the builder was emitting into before the subfunction is filled in.
  BasicBlock *OuterBB = Builder.GetInsertBlock();

  // The blocks of the subfunction, in the order they are appended to it.
  BasicBlock *SetupBB = BasicBlock::Create(Ctx, "polly.par.setup", Worker);
  BasicBlock *DoneBB = BasicBlock::Create(Ctx, "polly.par.exit", Worker);
  BasicBlock *NextBB = BasicBlock::Create(Ctx, "polly.par.checkNext", Worker);
  BasicBlock *BoundsBB =
      BasicBlock::Create(Ctx, "polly.par.loadIVBounds", Worker);

  DT.addNewBlock(SetupBB, OuterBB);
  DT.addNewBlock(DoneBB, SetupBB);
  DT.addNewBlock(NextBB, SetupBB);
  DT.addNewBlock(BoundsBB, SetupBB);

  // Setup block: stack slots for the bounds, then unpack the user context.
  Builder.SetInsertPoint(SetupBB);
  Value *LowerSlot =
      Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
  Value *UpperSlot =
      Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
  Value *TypedContext = Builder.CreateBitCast(
      &*Worker->arg_begin(), StructData->getType(), "polly.par.userContext");

  extractValuesFromStruct(Data, StructData->getAllocatedType(), TypedContext,
                          Map);
  Builder.CreateBr(NextBB);

  // Check block: is there another set of iterations to execute?
  Builder.SetInsertPoint(NextBB);
  Value *WorkItem = createCallGetWorkItem(LowerSlot, UpperSlot);
  Value *MoreWork = Builder.CreateTrunc(WorkItem, Builder.getInt1Ty(),
                                        "polly.par.hasNextScheduleBlock");
  Builder.CreateCondBr(MoreWork, BoundsBB, DoneBB);

  // Bounds block: load the iv bounds for this set of iterations.
  Builder.SetInsertPoint(BoundsBB);
  Value *ChunkLB = Builder.CreateLoad(LowerSlot, "polly.par.LB");
  Value *ChunkUB = Builder.CreateLoad(UpperSlot, "polly.par.UB");

  // Subtract one as the upper bound provided by OpenMP is a < comparison
  // whereas the codegenForSequential function creates a <= comparison.
  Value *InclusiveUB = Builder.CreateSub(
      ChunkUB, ConstantInt::get(LongType, 1), "polly.par.UBAdjusted");

  // Close the block with the back edge to the check block, then step the
  // insert point back in front of that branch so the loop is emitted before it.
  Builder.CreateBr(NextBB);
  Builder.SetInsertPoint(&*--Builder.GetInsertPoint());

  BasicBlock *AfterBB;
  Value *IV = createLoop(ChunkLB, InclusiveUB, Stride, Builder, LI, DT,
                         AfterBB, ICmpInst::ICMP_SLE, nullptr, true,
                         /* UseGuard */ false);

  BasicBlock::iterator BodyPos = Builder.GetInsertPoint();

  // Exit block: terminate this subfunction.
  Builder.SetInsertPoint(DoneBB);
  createCallCleanupThread();
  Builder.CreateRetVoid();

  // Hand the builder back positioned where createLoop left it.
  Builder.SetInsertPoint(&*BodyPos);
  *SubFnPtr = Worker;

  return IV;
}
228 changes: 228 additions & 0 deletions polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
//===------ LoopGeneratorsGOMP.cpp - IR helper to create loops ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create parallel loops as LLVM-IR.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/LoopGeneratorsGOMP.h"
#include "polly/ScopDetection.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;
using namespace polly;

// Emit a call to GOMP_parallel_loop_runtime_start at the current insert point,
// declaring the function with external linkage if the module does not contain
// it yet. The arguments are, in order: SubFn, SubFnParam, the value of
// PollyNumThreads as an i32, LB, UB and Stride.
void ParallelLoopGeneratorGOMP::createCallSpawnThreads(Value *SubFn,
                                                       Value *SubFnParam,
                                                       Value *LB, Value *UB,
                                                       Value *Stride) {
  const std::string FnName = "GOMP_parallel_loop_runtime_start";
  Function *Callee = M->getFunction(FnName);

  if (Callee == nullptr) {
    // Declared as:
    //   void (void (*)(i8 *), i8 *, i32, LongType, LongType, LongType)
    FunctionType *WorkerTy = FunctionType::get(Builder.getVoidTy(),
                                               Builder.getInt8PtrTy(), false);
    Type *ParamTys[] = {PointerType::getUnqual(WorkerTy),
                        Builder.getInt8PtrTy(),
                        Builder.getInt32Ty(),
                        LongType,
                        LongType,
                        LongType};

    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
  }

  Value *NumThreads = Builder.getInt32(PollyNumThreads);
  Value *CallArgs[] = {SubFn, SubFnParam, NumThreads, LB, UB, Stride};

  Builder.CreateCall(Callee, CallArgs);
}

// Emit the call sequence that executes SubFn as a parallel loop: the call that
// starts the parallel loop in the runtime, a direct call of SubFn with
// SubFnParam, and the call that ends the parallel region, in that order.
void ParallelLoopGeneratorGOMP::deployParallelExecution(Value *SubFn,
                                                        Value *SubFnParam,
                                                        Value *LB, Value *UB,
                                                        Value *Stride) {
  // Tell the runtime we start a parallel loop.
  createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);

  // Call the subfunction directly as well, passing the user context.
  Value *WorkerArgs[] = {SubFnParam};
  Builder.CreateCall(SubFn, WorkerArgs);

  // Emit the matching end call.
  createCallJoinThreads();
}

// Create the (still empty) subfunction for the GNU backend: an internal
// function of type void(i8 *) named "<name of F>_polly_subfn" whose single
// argument is named "polly.par.userContext".
Function *ParallelLoopGeneratorGOMP::prepareSubFnDefinition(Function *F) const {
  Type *ArgTys[] = {Builder.getInt8PtrTy()};
  FunctionType *SubFnTy =
      FunctionType::get(Builder.getVoidTy(), ArgTys, false);

  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
                                     F->getName() + "_polly_subfn", M);

  // Name the function's only argument.
  Argument &UserContextArg = *SubFn->arg_begin();
  UserContextArg.setName("polly.par.userContext");

  return SubFn;
}

// Create a subfunction of the following (preliminary) structure:
//
// PrevBB
// |
// v
// HeaderBB
// | _____
// v v |
// CheckNextBB PreHeaderBB
// |\ |
// | \______/
// |
// v
// ExitBB
//
// HeaderBB will hold allocations and loading of variables.
// CheckNextBB will check for more work.
// If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB.
// PreHeaderBB loads the new boundaries (& will lead to the loop body later on).
// ExitBB marks the end of the parallel execution.
std::tuple<Value *, Function *>
ParallelLoopGeneratorGOMP::createSubFn(Value *Stride, AllocaInst *StructData,
SetVector<Value *> Data,
ValueMapT &Map) {
if (PollyScheduling != OMPGeneralSchedulingType::Runtime) {
// User tried to influence the scheduling type (currently not supported)
errs() << "warning: Polly's GNU OpenMP backend solely "
"supports the scheduling type 'runtime'.\n";
}

if (PollyChunkSize != 0) {
// User tried to influence the chunk size (currently not supported)
errs() << "warning: Polly's GNU OpenMP backend solely "
"supports the default chunk size.\n";
}

Function *SubFn = createSubFnDefinition();
LLVMContext &Context = SubFn->getContext();

// Store the previous basic block.
BasicBlock *PrevBB = Builder.GetInsertBlock();

// Create basic blocks.
BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
BasicBlock *CheckNextBB =
BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
BasicBlock *PreHeaderBB =
BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);

DT.addNewBlock(HeaderBB, PrevBB);
DT.addNewBlock(ExitBB, HeaderBB);
DT.addNewBlock(CheckNextBB, HeaderBB);
DT.addNewBlock(PreHeaderBB, HeaderBB);

// Fill up basic block HeaderBB.
Builder.SetInsertPoint(HeaderBB);
Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
Value *UserContext = Builder.CreateBitCast(
&*SubFn->arg_begin(), StructData->getType(), "polly.par.userContext");

extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
Map);
Builder.CreateBr(CheckNextBB);

// Add code to check if another set of iterations will be executed.
Builder.SetInsertPoint(CheckNextBB);
Value *Next = createCallGetWorkItem(LBPtr, UBPtr);
Value *HasNextSchedule = Builder.CreateTrunc(
Next, Builder.getInt1Ty(), "polly.par.hasNextScheduleBlock");
Builder.CreateCondBr(HasNextSchedule, PreHeaderBB, ExitBB);

// Add code to load the iv bounds for this set of iterations.
Builder.SetInsertPoint(PreHeaderBB);
Value *LB = Builder.CreateLoad(LBPtr, "polly.par.LB");
Value *UB = Builder.CreateLoad(UBPtr, "polly.par.UB");

// Subtract one as the upper bound provided by OpenMP is a < comparison
// whereas the codegenForSequential function creates a <= comparison.
UB = Builder.CreateSub(UB, ConstantInt::get(LongType, 1),
"polly.par.UBAdjusted");

Builder.CreateBr(CheckNextBB);
Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
BasicBlock *AfterBB;
Value *IV =
createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB, ICmpInst::ICMP_SLE,
nullptr, true, /* UseGuard */ false);

BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

// Add code to terminate this subfunction.
Builder.SetInsertPoint(ExitBB);
createCallCleanupThread();
Builder.CreateRetVoid();

Builder.SetInsertPoint(&*LoopBody);

return std::make_tuple(IV, SubFn);
}

// Emit a call to GOMP_loop_runtime_next(LBPtr, UBPtr), declaring the function
// with external linkage if the module does not contain it yet. Returns the
// result of comparing the call's i8 return value against zero (not-equal).
Value *ParallelLoopGeneratorGOMP::createCallGetWorkItem(Value *LBPtr,
                                                        Value *UBPtr) {
  const std::string FnName = "GOMP_loop_runtime_next";
  Function *Callee = M->getFunction(FnName);

  if (Callee == nullptr) {
    // Declared as: i8 (LongType *, LongType *)
    Type *ParamTys[] = {LongType->getPointerTo(), LongType->getPointerTo()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getInt8Ty(), ParamTys, false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage, FnName, M);
  }

  Value *CallArgs[] = {LBPtr, UBPtr};
  Value *RawResult = Builder.CreateCall(Callee, CallArgs);

  // Widen 'false' to the call's result type and test the result against it.
  Value *Zero = Builder.CreateZExt(Builder.getFalse(), RawResult->getType());
  return Builder.CreateICmpNE(RawResult, Zero);
}

void ParallelLoopGeneratorGOMP::createCallJoinThreads() {
const std::string Name = "GOMP_parallel_end";

Function *F = M->getFunction(Name);

// If F is not available, declare it.
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;

FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
F = Function::Create(Ty, Linkage, Name, M);
}

Builder.CreateCall(F, {});
}

void ParallelLoopGeneratorGOMP::createCallCleanupThread() {
const std::string Name = "GOMP_loop_end_nowait";

Function *F = M->getFunction(Name);

// If F is not available, declare it.
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;

FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
F = Function::Create(Ty, Linkage, Name, M);
}

Builder.CreateCall(F, {});
}
512 changes: 512 additions & 0 deletions polly/lib/CodeGen/LoopGeneratorsKMP.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -S -verify-dom-info < %s \
; RUN: -polly-parallel-force -polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -polly-scheduling=runtime \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR

; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR

; IR: @GOMP_parallel_loop_runtime_start

; LIBOMP-IR: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call
; LIBOMP-IR: call void @__kmpc_dispatch_init_{{[4|8]}}

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@longLimit = external global [9 x [23 x i32]], align 16
@shortLimit = external global [9 x [14 x i32]], align 16

Expand Down
91 changes: 90 additions & 1 deletion polly/test/Isl/CodeGen/OpenMP/single_loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@
; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-ast -analyze < %s | FileCheck %s -check-prefix=AST-STRIDE4
; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -S < %s | FileCheck %s -check-prefix=IR-STRIDE4

; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR
; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4

; This extensive test case checks the creation of the full set of OpenMP calls
; as well as the subfunction creation, using a trivial loop as an example.

;
; #define N 1024
; float A[N];
;
Expand Down Expand Up @@ -83,6 +88,90 @@
; IR-STRIDE4: %polly.indvar_next = add nsw i64 %polly.indvar, 4
; IR-STRIDE4 %polly.adjust_ub = sub i64 %polly.par.UBAdjusted, 4

; LIBOMP-IR: %struct.ident_t = type { i32, i32, i32, i32, i8* }

; LIBOMP-IR-LABEL: single_parallel_loop()
; LIBOMP-IR-NEXT: entry
; LIBOMP-IR-NEXT: %polly.par.userContext = alloca

; LIBOMP-IR-LABEL: polly.parallel.for:
; LIBOMP-IR-NEXT: %polly.par.userContext1 = bitcast {}* %polly.par.userContext to i8*
; LIBOMP-IR-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @.loc.dummy, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i64, i8*)* @single_parallel_loop_polly_subfn to void (i32*, i32*, ...)*), i64 0, i64 1024, i64 1, i8* %polly.par.userContext1)
; LIBOMP-IR-NEXT: br label %polly.exiting

; LIBOMP-IR: define internal void @single_parallel_loop_polly_subfn(i32* %polly.kmpc.global_tid, i32* %polly.kmpc.bound_tid, i64 %polly.kmpc.lb, i64 %polly.kmpc.ub, i64 %polly.kmpc.inc, i8* %polly.kmpc.shared)
; LIBOMP-IR-LABEL: polly.par.setup:
; LIBOMP-IR-NEXT: %polly.par.LBPtr = alloca i64
; LIBOMP-IR-NEXT: %polly.par.UBPtr = alloca i64
; LIBOMP-IR-NEXT: %polly.par.lastIterPtr = alloca i32
; LIBOMP-IR-NEXT: %polly.par.StridePtr = alloca i64
; LIBOMP-IR-NEXT: %polly.par.userContext = bitcast i8* %polly.kmpc.shared
; LIBOMP-IR-NEXT: %polly.par.global_tid = load i32, i32* %polly.kmpc.global_tid
; LIBOMP-IR-NEXT: store i64 %polly.kmpc.lb, i64* %polly.par.LBPtr
; LIBOMP-IR-NEXT: store i64 %polly.kmpc.ub, i64* %polly.par.UBPtr
; LIBOMP-IR-NEXT: store i32 0, i32* %polly.par.lastIterPtr
; LIBOMP-IR-NEXT: store i64 %polly.kmpc.inc, i64* %polly.par.StridePtr
; LIBOMP-IR-NEXT: %polly.indvar.UBAdjusted = add i64 %polly.kmpc.ub, -1
; LIBOMP-IR-NEXT: call void @__kmpc_for_static_init_{{[4|8]}}(%struct.ident_t* @.loc.dummy{{[.0-9]*}}, i32 %polly.par.global_tid, i32 33, i32* %polly.par.lastIterPtr, i64* %polly.par.LBPtr, i64* %polly.par.UBPtr, i64* %polly.par.StridePtr, i64 1, i64 43)
; LIBOMP-IR-NEXT: %polly.indvar.LB = load i64, i64* %polly.par.LBPtr
; LIBOMP-IR-NEXT: %polly.indvar.UB = load i64, i64* %polly.par.UBPtr
; LIBOMP-IR-NEXT: %polly.adjustedUBOutOfBounds = icmp slt i64 %polly.indvar.UB, %polly.indvar.UBAdjusted
; LIBOMP-IR-NEXT: %{{[0-9]+}} = select i1 %polly.adjustedUBOutOfBounds, i64 %polly.indvar.UB, i64 %polly.indvar.UBAdjusted
; LIBOMP-IR-NEXT: store i64 %{{[0-9]+}}, i64* %polly.par.UBPtr
; LIBOMP-IR-NEXT: %polly.hasIteration = icmp sle i64 %polly.indvar.LB, %{{[0-9]+}}
; LIBOMP-IR: br i1 %polly.hasIteration, label %polly.par.loadIVBounds, label %polly.par.exit

; LIBOMP-IR-LABEL: polly.par.exit:
; LIBOMP-IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @.loc.dummy, i32 %polly.par.global_tid)
; LIBOMP-IR-NEXT: ret void

; LIBOMP-IR-LABEL: polly.par.checkNext:
; LIBOMP-IR-NEXT: br label %polly.par.exit

; LIBOMP-IR-LABEL: polly.par.loadIVBounds:
; LIBOMP-IR-NEXT: br label %polly.loop_preheader

; LIBOMP-IR-LABEL: polly.loop_exit:
; LIBOMP-IR-NEXT: br label %polly.par.checkNext

; LIBOMP-IR-LABEL: polly.loop_header:
; LIBOMP-IR-NEXT: %polly.indvar = phi i64 [ %polly.indvar.LB, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.S ]
; LIBOMP-IR-NEXT: br label %polly.stmt.S

; LIBOMP-IR-LABEL: polly.stmt.S:
; LIBOMP-IR-NEXT: %[[gep:[._a-zA-Z0-9]*]] = getelementptr [1024 x float], [1024 x float]* {{.*}}, i64 0, i64 %polly.indvar
; LIBOMP-IR-NEXT: store float 1.000000e+00, float* %[[gep]]
; LIBOMP-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, %polly.kmpc.inc
; LIBOMP-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, %{{[0-9]+}}
; LIBOMP-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit

; LIBOMP-IR-LABEL: polly.loop_preheader:
; LIBOMP-IR-NEXT: br label %polly.loop_header

; LIBOMP-IR: attributes #1 = { "polly.skip.fn" }

; LIBOMP-IR-DYNAMIC: call void @__kmpc_dispatch_init_{{[4|8]}}(%struct.ident_t* @.loc.dummy, i32 %polly.par.global_tid, i32 35, i64 %polly.kmpc.lb, i64 %polly.indvar.UBAdjusted, i64 %polly.kmpc.inc, i64 1)
; LIBOMP-IR-DYNAMIC-NEXT: %{{[0-9]+}} = call i32 @__kmpc_dispatch_next_{{[4|8]}}(%struct.ident_t* @.loc.dummy, i32 %polly.par.global_tid, i32* %polly.par.lastIterPtr, i64* %polly.par.LBPtr, i64* %polly.par.UBPtr, i64* %polly.par.StridePtr)
; LIBOMP-IR-DYNAMIC-NEXT: %polly.hasIteration = icmp eq i32 %{{[0-9]+}}, 1
; LIBOMP-IR-DYNAMIC-NEXT: br i1 %polly.hasIteration, label %polly.par.loadIVBounds, label %polly.par.exit

; LIBOMP-IR-DYNAMIC-LABEL: polly.par.exit:
; LIBOMP-IR-DYNAMIC-NEXT: ret void

; LIBOMP-IR-DYNAMIC-LABEL: polly.par.checkNext:
; LIBOMP-IR-DYNAMIC-NEXT: %{{[0-9]+}} = call i32 @__kmpc_dispatch_next_{{[4|8]}}(%struct.ident_t* @.loc.dummy, i32 %polly.par.global_tid, i32* %polly.par.lastIterPtr, i64* %polly.par.LBPtr, i64* %polly.par.UBPtr, i64* %polly.par.StridePtr)
; LIBOMP-IR-DYNAMIC-NEXT: %polly.hasWork = icmp eq i32 %{{[0-9]+}}, 1
; LIBOMP-IR-DYNAMIC-NEXT: br i1 %polly.hasWork, label %polly.par.loadIVBounds, label %polly.par.exit

; LIBOMP-IR-DYNAMIC-LABEL: polly.par.loadIVBounds:
; LIBOMP-IR-DYNAMIC-NEXT: %polly.indvar.LB = load i64, i64* %polly.par.LBPtr
; LIBOMP-IR-DYNAMIC-NEXT: %polly.indvar.UB = load i64, i64* %polly.par.UBPtr
; LIBOMP-IR-DYNAMIC-NEXT: br label %polly.loop_preheader

; LIBOMP-IR-DYNAMIC-FOUR: call void @__kmpc_dispatch_init_{{[4|8]}}(%struct.ident_t* @.loc.dummy, i32 %polly.par.global_tid, i32 35, i64 %polly.kmpc.lb, i64 %polly.indvar.UBAdjusted, i64 %polly.kmpc.inc, i64 4)

; LIBOMP-IR-STRIDE4: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @.loc.dummy, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i64, i8*)* @single_parallel_loop_polly_subfn to void (i32*, i32*, ...)*), i64 0, i64 1024, i64 4, i8* %polly.par.userContext1)

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

@A = common global [1024 x float] zeroinitializer, align 16
Expand Down
32 changes: 27 additions & 5 deletions polly/test/Isl/CodeGen/OpenMP/single_loop_with_param.ll
Original file line number Diff line number Diff line change
@@ -1,24 +1,46 @@
; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -S -verify-dom-info < %s \
; RUN: -polly-parallel-force -polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR

; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR

; RUN: opt %loadPolly -polly-parallel \
; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
; RUN: -polly-scheduling=static \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR

; Ensure the scalars are initialized before the OpenMP code is launched.
;
; #define N 1024
; float A[N];
;
; void single_parallel_loop(float alpha) {
; for (long i = 0; i < N; i++)
; A[i] = alpha;
; }

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

; Ensure the scalars are initialized before the OpenMP code is launched.
;

; IR-LABEL: polly.start:
; IR-NEXT: store float %alpha, float* %alpha.s2a

; IR: GOMP_parallel_loop_runtime_start

; LIBOMP-IR-LABEL: polly.start:
; LIBOMP-IR-NEXT: store float %alpha, float* %alpha.s2a

; LIBOMP-IR: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call
; LIBOMP-IR: call void @__kmpc_dispatch_init_{{[4|8]}}

; LIBOMP-STATIC-IR: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call
; LIBOMP-STATIC-IR: call void @__kmpc_for_static_init_{{[4|8]}}

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

@A = common global [1024 x float] zeroinitializer, align 16

define void @single_parallel_loop(float %alpha) nounwind {
Expand Down
25 changes: 18 additions & 7 deletions polly/test/Isl/CodeGen/openmp_limit_threads.ll
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
; RUN: opt %loadPolly -polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR

; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR

; Ensure that the provided thread numbers are forwarded to the OpenMP calls.
;
; AUTO: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @jd_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 0, i64 0, i64 1024, i64 1)
; ONE: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @jd_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 1, i64 0, i64 1024, i64 1)
; FOUR: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @jd_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 4, i64 0, i64 1024, i64 1)
;
; void jd(int *A) {
; void storePosition(int *A) {
; for (int i = 0; i < 1024; i++)
; for (int j = 0; j < 1024; j++)
; A[i + j * 1024] = 0;
; }
;

; AUTO: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @storePosition_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 0, i64 0, i64 1024, i64 1)
; ONE: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @storePosition_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 1, i64 0, i64 1024, i64 1)
; FOUR: call void @GOMP_parallel_loop_runtime_start(void (i8*)* @storePosition_polly_subfn, i8* %polly.par.userContext{{[0-9]*}}, i32 4, i64 0, i64 1024, i64 1)

; In automatic mode, no threads are pushed explicitly.
; LIBOMP-AUTO-NOT: call void @__kmpc_push_num_threads
; LIBOMP-ONE: call void @__kmpc_push_num_threads(%struct.ident_t* @.loc.dummy{{[.0-9]*}}, i32 %{{[0-9]+}}, i32 1)
; LIBOMP-FOUR: call void @__kmpc_push_num_threads(%struct.ident_t* @.loc.dummy{{[.0-9]*}}, i32 %{{[0-9]+}}, i32 4)

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @jd(i32* %A) {
define void @storePosition(i32* %A) {
entry:
br label %for.cond

Expand Down