414 changes: 414 additions & 0 deletions flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -1339,10 +1339,12 @@ class OpenMPIRBuilder {
/// in reductions.
/// \param ReductionInfos A list of info on each reduction variable.
/// \param IsNoWait A flag set if the reduction is marked as nowait.
/// \param IsByRef A flag set if the reduction variables are passed by
/// reference rather than by value.
InsertPointTy createReductions(const LocationDescription &Loc,
InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos,
bool IsNoWait = false);
bool IsNoWait = false, bool IsByRef = false);

///}

Expand Down
30 changes: 22 additions & 8 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2110,7 +2110,7 @@ Function *getFreshReductionFunc(Module &M) {

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
const LocationDescription &Loc, InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
for (const ReductionInfo &RI : ReductionInfos) {
(void)RI;
assert(RI.Variable && "expected non-null variable");
Expand Down Expand Up @@ -2197,17 +2197,29 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
for (auto En : enumerate(ReductionInfos)) {
const ReductionInfo &RI = En.value();
Type *ValueType = RI.ElementType;
Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
"red.value." + Twine(En.index()));
// We have one less load for by-ref case because that load is now inside of
// the reduction region
Value *RedValue = nullptr;
if (!IsByRef) {
RedValue = Builder.CreateLoad(ValueType, RI.Variable,
"red.value." + Twine(En.index()));
}
Value *PrivateRedValue =
Builder.CreateLoad(ValueType, RI.PrivateVariable,
"red.private.value." + Twine(En.index()));
Value *Reduced;
Builder.restoreIP(
RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
if (IsByRef) {
Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
PrivateRedValue, Reduced));
} else {
Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
PrivateRedValue, Reduced));
}
if (!Builder.GetInsertBlock())
return InsertPointTy();
Builder.CreateStore(Reduced, RI.Variable);
// for by-ref case, the store is inside of the reduction region
if (!IsByRef)
Builder.CreateStore(Reduced, RI.Variable);
}
Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
Expand All @@ -2219,7 +2231,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
// function. There are no loads/stores here because they will be happening
// inside the atomic elementwise reduction.
Builder.SetInsertPoint(AtomicRedBlock);
if (CanGenerateAtomic) {
if (CanGenerateAtomic && !IsByRef) {
for (const ReductionInfo &RI : ReductionInfos) {
Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
RI.Variable, RI.PrivateVariable));
Expand Down Expand Up @@ -2257,7 +2269,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
if (!Builder.GetInsertBlock())
return InsertPointTy();
Builder.CreateStore(Reduced, LHSPtr);
// store is inside of the reduction region when using by-ref
if (!IsByRef)
Builder.CreateStore(Reduced, LHSPtr);
}
Builder.CreateRetVoid();

Expand Down
12 changes: 11 additions & 1 deletion mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,9 @@ def ParallelOp : OpenMP_Op<"parallel", [

The optional $proc_bind_val attribute controls the thread affinity for the execution
of the parallel region.

The optional byref attribute controls whether reduction arguments are passed by
reference or by value.
}];

let arguments = (ins Optional<I1>:$if_expr_var,
Expand All @@ -278,7 +281,8 @@ def ParallelOp : OpenMP_Op<"parallel", [
OptionalAttr<SymbolRefArrayAttr>:$reductions,
OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
Variadic<AnyType>:$private_vars,
OptionalAttr<SymbolRefArrayAttr>:$privatizers);
OptionalAttr<SymbolRefArrayAttr>:$privatizers,
UnitAttr:$byref);

let regions = (region AnyRegion:$region);

Expand All @@ -299,6 +303,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
$allocators_vars, type($allocators_vars)
) `)`
| `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
| `byref` $byref
) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
$reductions, $private_vars, type($private_vars),
$privatizers) attr-dict
Expand Down Expand Up @@ -570,6 +575,9 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
The optional `order` attribute specifies which order the iterations of the
associate loops are executed in. Currently the only option for this
attribute is "concurrent".

The optional `byref` attribute indicates that reduction arguments should be
passed by reference.
}];

let arguments = (ins Variadic<IntLikeType>:$lowerBound,
Expand All @@ -584,6 +592,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
UnitAttr:$simd_modifier,
UnitAttr:$nowait,
UnitAttr:$byref,
ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val,
OptionalAttr<OrderKindAttr>:$order_val,
UnitAttr:$inclusive);
Expand Down Expand Up @@ -613,6 +622,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
$schedule_val, $schedule_modifier, $simd_modifier,
$schedule_chunk_var, type($schedule_chunk_var)) `)`
|`nowait` $nowait
|`byref` $byref
|`ordered` `(` $ordered_val `)`
|`order` `(` custom<ClauseAttr>($order_val) `)`
) custom<WsLoop>($region, $lowerBound, $upperBound, $step,
Expand Down
5 changes: 3 additions & 2 deletions mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
/*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
/*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
/*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
/*privatizers=*/nullptr);
/*privatizers=*/nullptr, /*byref=*/false);
state.addAttributes(attributes);
}

Expand Down Expand Up @@ -1674,7 +1674,8 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state,
/*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
/*reductions=*/nullptr, /*schedule_val=*/nullptr,
/*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
/*simd_modifier=*/false, /*nowait=*/false, /*ordered_val=*/nullptr,
/*simd_modifier=*/false, /*nowait=*/false, /*byref=*/false,
/*ordered_val=*/nullptr,
/*order_val=*/nullptr, /*inclusive=*/false);
state.addAttributes(attributes);
}
Expand Down
99 changes: 72 additions & 27 deletions mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -805,12 +805,12 @@ convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder,
/// Allocate space for privatized reduction variables.
template <typename T>
static void
allocReductionVars(T loop, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
SmallVector<omp::ReductionDeclareOp> &reductionDecls,
SmallVector<llvm::Value *> &privateReductionVariables,
DenseMap<Value, llvm::Value *> &reductionVariableMap) {
allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
SmallVector<omp::ReductionDeclareOp> &reductionDecls,
SmallVector<llvm::Value *> &privateReductionVariables,
DenseMap<Value, llvm::Value *> &reductionVariableMap) {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.restoreIP(allocaIP);
auto args =
Expand Down Expand Up @@ -863,6 +863,7 @@ static LogicalResult
convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
auto loop = cast<omp::WsLoopOp>(opInst);
const bool isByRef = loop.getByref();
// TODO: this should be in the op verifier instead.
if (loop.getLowerBound().empty())
return failure();
Expand All @@ -888,18 +889,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,

SmallVector<llvm::Value *> privateReductionVariables;
DenseMap<Value, llvm::Value *> reductionVariableMap;
allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,
privateReductionVariables, reductionVariableMap);

// Store the mapping between reduction variables and their private copies on
// ModuleTranslation stack. It can be then recovered when translating
// omp.reduce operations in a separate call.
LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
moduleTranslation, reductionVariableMap);
if (!isByRef) {
allocByValReductionVars(loop, builder, moduleTranslation, allocaIP,
reductionDecls, privateReductionVariables,
reductionVariableMap);
}

// Before the loop, store the initial values of reductions into reduction
// variables. Although this could be done after allocas, we don't want to mess
// up with the alloca insertion point.
MutableArrayRef<BlockArgument> reductionArgs =
loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
Expand All @@ -908,9 +908,31 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
return failure();
assert(phis.size() == 1 && "expected one value to be yielded from the "
"reduction neutral element declaration region");
builder.CreateStore(phis[0], privateReductionVariables[i]);
if (isByRef) {
// Allocate reduction variable (which is a pointer to the real reduction
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
// Store the result of the inlined region to the allocated reduction var
// ptr
builder.CreateStore(phis[0], var);

privateReductionVariables.push_back(var);
moduleTranslation.mapValue(reductionArgs[i], phis[0]);
reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
} else {
// by-val case: store the initial value here (for by-ref the store is inside
// of the reduction init region)
builder.CreateStore(phis[0], privateReductionVariables[i]);
// the rest was handled in allocByValReductionVars
}
}

// Store the mapping between reduction variables and their private copies on
// ModuleTranslation stack. It can be then recovered when translating
// omp.reduce operations in a separate call.
LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
moduleTranslation, reductionVariableMap);

// Set up the source location value for OpenMP runtime.
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);

Expand Down Expand Up @@ -1014,7 +1036,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
builder.SetInsertPoint(tempTerminator);
llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
loop.getNowait());
loop.getNowait(), isByRef);
if (!contInsertPoint.getBlock())
return loop->emitOpError() << "failed to convert reductions";
auto nextInsertionPoint =
Expand Down Expand Up @@ -1068,6 +1090,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
OmpParallelOpConversionManager raii(opInst);
const bool isByRef = opInst.getByref();

// TODO: support error propagation in OpenMPIRBuilder and use it instead of
// relying on captured variables.
Expand All @@ -1082,18 +1105,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
// Allocate reduction vars
SmallVector<llvm::Value *> privateReductionVariables;
DenseMap<Value, llvm::Value *> reductionVariableMap;
allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
reductionDecls, privateReductionVariables,
reductionVariableMap);

// Store the mapping between reduction variables and their private copies on
// ModuleTranslation stack. It can be then recovered when translating
// omp.reduce operations in a separate call.
LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
moduleTranslation, reductionVariableMap);
if (!isByRef) {
allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
reductionDecls, privateReductionVariables,
reductionVariableMap);
}

// Initialize reduction vars
builder.restoreIP(allocaIP);
MutableArrayRef<BlockArgument> reductionArgs =
opInst.getRegion().getArguments().take_back(
opInst.getNumReductionVars());
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
if (failed(inlineConvertOmpRegions(
Expand All @@ -1104,9 +1126,32 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
"expected one value to be yielded from the "
"reduction neutral element declaration region");
builder.restoreIP(allocaIP);
builder.CreateStore(phis[0], privateReductionVariables[i]);

if (isByRef) {
// Allocate reduction variable (which is a pointer to the real reduction
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
// Store the result of the inlined region to the allocated reduction var
// ptr
builder.CreateStore(phis[0], var);

privateReductionVariables.push_back(var);
moduleTranslation.mapValue(reductionArgs[i], phis[0]);
reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
} else {
// for by-ref case the store is inside of the reduction init region
builder.CreateStore(phis[0], privateReductionVariables[i]);
// the rest is done in allocByValReductionVars
}
}

// Store the mapping between reduction variables and their private copies on
// ModuleTranslation stack. It can be then recovered when translating
// omp.reduce operations in a separate call.
LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
moduleTranslation, reductionVariableMap);

// Save the alloca insertion point on ModuleTranslation stack for use in
// nested regions.
LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
Expand Down Expand Up @@ -1137,7 +1182,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,

llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
ompBuilder->createReductions(builder.saveIP(), allocaIP,
reductionInfos, false);
reductionInfos, false, isByRef);
if (!contInsertPoint.getBlock()) {
bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
return;
Expand Down
66 changes: 66 additions & 0 deletions mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

// Reduction declaration whose reduction value has type !llvm.ptr: both regions
// operate on pointers to an i32 rather than on the i32 itself.
omp.reduction.declare @add_reduction_i_32 : !llvm.ptr init {
^bb0(%arg0: !llvm.ptr):
// Init region: allocate one i32, store the neutral element 0 into it and
// yield the address of that allocation.
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(1 : i64) : i64
%2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr
llvm.store %0, %2 : i32, !llvm.ptr
omp.yield(%2 : !llvm.ptr)
} combiner {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
// Combiner region: load both i32 operands through their pointers, add them,
// write the sum back through %arg0 and yield %arg0.
%0 = llvm.load %arg0 : !llvm.ptr -> i32
%1 = llvm.load %arg1 : !llvm.ptr -> i32
%2 = llvm.add %0, %1 : i32
llvm.store %2, %arg0 : i32, !llvm.ptr
omp.yield(%arg0 : !llvm.ptr)
}

// CHECK-LABEL: @main
// Test driver: runs an omp.parallel region marked `byref` that reduces the
// global @i with @add_reduction_i_32.
llvm.func @main() {
%0 = llvm.mlir.constant(-1 : i32) : i32
// Address of the global reduction variable @i.
%1 = llvm.mlir.addressof @i : !llvm.ptr
// Inside the region %arg0 is the pointer-typed reduction block argument
// associated with %1.
omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr) {
// Store -1 through the reduction argument.
llvm.store %0, %arg0 : i32, !llvm.ptr
omp.terminator
}
llvm.return
}
// Internal i32 global, initialised to 0, used as the reduction variable in
// @main.
llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
%0 = llvm.mlir.constant(0 : i32) : i32
llvm.return %0 : i32
}

// CHECK: %{{.+}} =
// Call to the outlined function.
// CHECK: call void {{.*}} @__kmpc_fork_call
// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]

// Outlined function.
// CHECK: define internal void @[[OUTLINED]]

// Private reduction variable and its initialization.
// CHECK: %tid.addr.local = alloca i32
// CHECK: %[[PRIVATE:.+]] = alloca i32
// CHECK: store i32 0, ptr %[[PRIVATE]]
// CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]],

// Call to the reduction function.
// CHECK: call i32 @__kmpc_reduce
// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]


// Non-atomic reduction:
// CHECK: %[[PRIV_VAL_PTR:.+]] = load ptr, ptr %[[PRIV_PTR]]
// CHECK: %[[LOAD:.+]] = load i32, ptr @i
// CHECK: %[[PRIV_VAL:.+]] = load i32, ptr %[[PRIV_VAL_PTR]]
// CHECK: %[[SUM:.+]] = add i32 %[[LOAD]], %[[PRIV_VAL]]
// CHECK: store i32 %[[SUM]], ptr @i
// CHECK: call void @__kmpc_end_reduce
// CHECK: br label %[[FINALIZE:.+]]

// CHECK: [[FINALIZE]]:

// Reduction function.
// CHECK: define internal void @[[REDFUNC]]
// CHECK: add i32