Skip to content

Commit

Permalink
[Flang] Move genMinMaxlocReductionLoop to a common location.
Browse files Browse the repository at this point in the history
The shared library build doesn't like references to genMinMaxlocReductionLoop,
in Optimizer/Transforms, from HLFIR/Optimizer/Transforms. For the moment I've
moved the code to the header file where it can be shared, like other methods in
Utils.h.
  • Loading branch information
davemgreen committed Jan 25, 2024
1 parent b0b7be2 commit 202917f
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 135 deletions.
135 changes: 128 additions & 7 deletions flang/include/flang/Optimizer/Support/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
Expand Down Expand Up @@ -144,13 +145,133 @@ using AddrGeneratorTy = llvm::function_ref<mlir::Value(
mlir::Value)>;

// Produces a loop nest for a Minloc/Maxloc intrinsic reduction.
//
// Declaration only. Parameters, in order:
//   builder                - builder used to emit the operations.
//   array                  - the array being searched.
//   initVal                - callback producing the initial reduction value.
//   genBody                - callback emitting the innermost loop body.
//   getAddrFn              - callback yielding the address of one element of
//                            the result array.
//   rank                   - number of dimensions of `array`.
//   elementType            - element type of `array`.
//   loc                    - location attached to the emitted operations.
//   maskElemType           - element type of the mask argument.
//   resultArr              - array receiving the computed location.
//   maskMayBeLogicalScalar - whether the mask may be a scalar logical.
void genMinMaxlocReductionLoop(fir::FirOpBuilder &builder, mlir::Value array,
InitValGeneratorTy initVal,
MinlocBodyOpGeneratorTy genBody,
fir::AddrGeneratorTy getAddrFn, unsigned rank,
mlir::Type elementType, mlir::Location loc,
mlir::Type maskElemType, mlir::Value resultArr,
bool maskMayBeLogicalScalar);
/// Emits the reduction loop nest used by the MINLOC/MAXLOC simplification.
///
/// Starting at the builder's current insertion point this generates, in order:
///   1. a fir.convert of \p array to a box of a rank-\p rank sequence of
///      \p elementType with unknown extents;
///   2. a temporary flag of the result element type, initialised to 0, which
///      is passed to \p genBody;
///   3. \p rank nested fir.do_loop ops (the innermost one walks dimension 0),
///      each carrying the reduction value as its single iter-arg, seeded with
///      the value produced by \p initVal;
///   4. a fix-up after the nest: if the flag equals 1 and the reduced value
///      still compares equal to a fresh \p initVal value, every one of the
///      \p rank elements of \p resultArr is overwritten with 1.
///
/// On return the insertion point is right after the outer fix-up fir.if.
///
/// \param builder     Builder used for all emitted operations.
/// \param array       Array to reduce; converted to a boxed sequence type.
/// \param initVal     Callback producing the initial reduction value. It is
///                    invoked twice: before the loop nest and in the fix-up.
/// \param genBody     Callback emitting the innermost loop body. Receives the
///                    converted array, the flag reference, the current
///                    reduction value and the indices ordered
///                    <dim-0-idx, dim-1-idx, ...>; returns the updated
///                    reduction value.
/// \param getAddrFn   Callback returning the address of element `index` of
///                    \p resultArr.
/// \param rank        Number of loops to generate; must be greater than zero.
/// \param elementType Element type of \p array. A floating-point type selects
///                    an ordered-equal float compare in the fix-up, anything
///                    else an integer equality compare.
/// \param loc         Location attached to every emitted operation.
/// \param maskElemType Not referenced by this function.
/// \param resultArr   Result array; its Fortran element type is used for the
///                    flag and for the stored 1s.
/// \param maskMayBeLogicalScalar When true and the block holding the loop nest
///                    is directly owned by a fir.if, that fir.if is terminated
///                    with a fir.result of the reduction value and emission
///                    continues after it, using its first result.
inline void genMinMaxlocReductionLoop(
    fir::FirOpBuilder &builder, mlir::Value array,
    fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody,
    fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType,
    mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr,
    bool maskMayBeLogicalScalar) {
  // Validate the precondition before emitting any IR.
  assert(rank > 0 && "rank cannot be zero");

  mlir::IndexType idxTy = builder.getIndexType();

  mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);

  // View the input as box<array<? x ... x ? x elementType>> of the given rank.
  fir::SequenceType::Shape flatShape(rank,
                                     fir::SequenceType::getUnknownExtent());
  mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
  mlir::Type boxArrTy = fir::BoxType::get(arrTy);
  array = builder.create<fir::ConvertOp>(loc, boxArrTy, array);

  // The flag starts at 0 and is handed to genBody.
  // NOTE(review): this store is emitted at the current insertion point, which
  // may be inside a fir.if (see the maskMayBeLogicalScalar handling below);
  // confirm the flag is also initialised on the path that skips that region.
  mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType());
  mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
  mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0);
  mlir::Value flagRef = builder.createTemporary(loc, resultElemType);
  builder.create<fir::StoreOp>(loc, zero, flagRef);

  mlir::Value init = initVal(builder, loc, elementType);
  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds;

  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);

  // Compute all the upper bounds before the loop nest.
  // It is not strictly necessary for performance, since any LICM
  // optimization should be able to hoist the redundancy.
  for (unsigned i = 0; i < rank; ++i) {
    mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
    auto dims =
        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx);
    mlir::Value len = dims.getResult(1);
    // We use C indexing here, so len-1 is the loop's upper bound.
    mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one);
    bounds.push_back(loopCount);
  }

  // Create a loop nest consisting of fir.do_loop operations.
  // Collect the loops' induction variables into the indices array,
  // which will be used in the innermost loop to load the input
  // array's element.
  // The loops are generated such that the innermost loop processes
  // the 0 dimension.
  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices;
  for (unsigned i = rank; 0 < i; --i) {
    mlir::Value step = one;
    mlir::Value loopCount = bounds[i - 1];
    auto loop = builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step,
                                              /*unordered=*/false,
                                              /*finalCountValue=*/false, init);
    // Inner loops (and the body) consume this loop's iter-arg.
    init = loop.getRegionIterArgs()[0];
    indices.push_back(loop.getInductionVar());
    // Set insertion point to the loop body so that the next loop
    // is inserted inside the current one.
    builder.setInsertionPointToStart(loop.getBody());
  }

  // Reverse the indices such that they are ordered as:
  //   <dim-0-idx, dim-1-idx, ...>
  std::reverse(indices.begin(), indices.end());
  mlir::Value reductionVal =
      genBody(builder, loc, elementType, array, flagRef, init, indices);

  // Unwind the loop nest and insert ResultOp on each level
  // to return the updated value of the reduction to the enclosing
  // loops.
  for (unsigned i = 0; i < rank; ++i) {
    auto result = builder.create<fir::ResultOp>(loc, reductionVal);
    // Proceed to the outer loop.
    auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp());
    reductionVal = loop.getResult(0);
    // Set insertion point after the loop operation that we have
    // just processed.
    builder.setInsertionPointAfter(loop.getOperation());
  }

  // End of loop nest. The insertion point is after the outermost loop.
  if (maskMayBeLogicalScalar) {
    if (fir::IfOp ifOp =
            mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) {
      builder.create<fir::ResultOp>(loc, reductionVal);
      builder.setInsertionPointAfter(ifOp);
      // Redefine flagSet to escape scope of ifOp.
      flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
      reductionVal = ifOp.getResult(0);
    }
  }

  // Check for case where array was full of max values.
  // flag will be 0 if mask was never true, 1 if mask was true at some point,
  // this is needed to avoid catching cases where we didn't access any elements
  // e.g. mask=.FALSE.
  mlir::Value flagValue =
      builder.create<fir::LoadOp>(loc, resultElemType, flagRef);
  mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>(
      loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet);
  fir::IfOp ifMaskTrueOp =
      builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false);
  builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front());

  // Inside the flag check: is the reduced value still the initial value?
  mlir::Value testInit = initVal(builder, loc, elementType);
  mlir::Value stillInit;
  if (mlir::isa<mlir::FloatType>(elementType))
    stillInit = builder.create<mlir::arith::CmpFOp>(
        loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal);
  else
    stillInit = builder.create<mlir::arith::CmpIOp>(
        loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal);
  fir::IfOp ifMinSetOp =
      builder.create<fir::IfOp>(loc, stillInit, /*withElseRegion=*/false);
  builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front());

  // Load output array with 1s instead of 0s.
  for (unsigned int i = 0; i < rank; ++i) {
    mlir::Value index = builder.createIntegerConstant(loc, idxTy, i);
    mlir::Value resultElemAddr =
        getAddrFn(builder, loc, resultElemType, resultArr, index);
    builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr);
  }
  // Leave the builder positioned after the whole fix-up.
  builder.setInsertionPointAfter(ifMaskTrueOp);
}

} // namespace fir

Expand Down
128 changes: 0 additions & 128 deletions flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,134 +353,6 @@ genReductionLoop(fir::FirOpBuilder &builder, mlir::func::FuncOp &funcOp,
builder.create<mlir::func::ReturnOp>(loc, results[resultIndex]);
}

/// Emits the reduction loop nest used by the MINLOC/MAXLOC simplification.
///
/// Starting at the builder's current insertion point this generates, in order:
///   1. a fir.convert of \p array to a box of a rank-\p rank sequence of
///      \p elementType with unknown extents;
///   2. a temporary flag of the result element type, initialised to 0, which
///      is passed to \p genBody;
///   3. \p rank nested fir.do_loop ops (the innermost one walks dimension 0),
///      each carrying the reduction value as its single iter-arg, seeded with
///      the value produced by \p initVal;
///   4. a fix-up after the nest: if the flag equals 1 and the reduced value
///      still compares equal to a fresh \p initVal value, every one of the
///      \p rank elements of \p resultArr is overwritten with 1.
///
/// On return the insertion point is right after the outer fix-up fir.if.
///
/// \param builder     Builder used for all emitted operations.
/// \param array       Array to reduce; converted to a boxed sequence type.
/// \param initVal     Callback producing the initial reduction value. It is
///                    invoked twice: before the loop nest and in the fix-up.
/// \param genBody     Callback emitting the innermost loop body. Receives the
///                    converted array, the flag reference, the current
///                    reduction value and the indices ordered
///                    <dim-0-idx, dim-1-idx, ...>; returns the updated
///                    reduction value.
/// \param getAddrFn   Callback returning the address of element `index` of
///                    \p resultArr.
/// \param rank        Number of loops to generate; must be greater than zero.
/// \param elementType Element type of \p array. A floating-point type selects
///                    an ordered-equal float compare in the fix-up, anything
///                    else an integer equality compare.
/// \param loc         Location attached to every emitted operation.
/// \param maskElemType Not referenced by this function.
/// \param resultArr   Result array; its Fortran element type is used for the
///                    flag and for the stored 1s.
/// \param maskMayBeLogicalScalar When true and the block holding the loop nest
///                    is directly owned by a fir.if, that fir.if is terminated
///                    with a fir.result of the reduction value and emission
///                    continues after it, using its first result.
void fir::genMinMaxlocReductionLoop(
    fir::FirOpBuilder &builder, mlir::Value array,
    fir::InitValGeneratorTy initVal, fir::MinlocBodyOpGeneratorTy genBody,
    fir::AddrGeneratorTy getAddrFn, unsigned rank, mlir::Type elementType,
    mlir::Location loc, mlir::Type maskElemType, mlir::Value resultArr,
    bool maskMayBeLogicalScalar) {
  // Validate the precondition before emitting any IR.
  assert(rank > 0 && "rank cannot be zero");

  mlir::IndexType idxTy = builder.getIndexType();

  mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);

  // View the input as box<array<? x ... x ? x elementType>> of the given rank.
  fir::SequenceType::Shape flatShape(rank,
                                     fir::SequenceType::getUnknownExtent());
  mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
  mlir::Type boxArrTy = fir::BoxType::get(arrTy);
  array = builder.create<fir::ConvertOp>(loc, boxArrTy, array);

  // The flag starts at 0 and is handed to genBody.
  // NOTE(review): this store is emitted at the current insertion point, which
  // may be inside a fir.if (see the maskMayBeLogicalScalar handling below);
  // confirm the flag is also initialised on the path that skips that region.
  mlir::Type resultElemType = hlfir::getFortranElementType(resultArr.getType());
  mlir::Value flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
  mlir::Value zero = builder.createIntegerConstant(loc, resultElemType, 0);
  mlir::Value flagRef = builder.createTemporary(loc, resultElemType);
  builder.create<fir::StoreOp>(loc, zero, flagRef);

  mlir::Value init = initVal(builder, loc, elementType);
  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> bounds;

  mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);

  // Compute all the upper bounds before the loop nest.
  // It is not strictly necessary for performance, since any LICM
  // optimization should be able to hoist the redundancy.
  for (unsigned i = 0; i < rank; ++i) {
    mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
    auto dims =
        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, array, dimIdx);
    mlir::Value len = dims.getResult(1);
    // We use C indexing here, so len-1 is the loop's upper bound.
    mlir::Value loopCount = builder.create<mlir::arith::SubIOp>(loc, len, one);
    bounds.push_back(loopCount);
  }

  // Create a loop nest consisting of fir.do_loop operations.
  // Collect the loops' induction variables into the indices array,
  // which will be used in the innermost loop to load the input
  // array's element.
  // The loops are generated such that the innermost loop processes
  // the 0 dimension.
  llvm::SmallVector<mlir::Value, Fortran::common::maxRank> indices;
  for (unsigned i = rank; 0 < i; --i) {
    mlir::Value step = one;
    mlir::Value loopCount = bounds[i - 1];
    auto loop = builder.create<fir::DoLoopOp>(loc, zeroIdx, loopCount, step,
                                              /*unordered=*/false,
                                              /*finalCountValue=*/false, init);
    // Inner loops (and the body) consume this loop's iter-arg.
    init = loop.getRegionIterArgs()[0];
    indices.push_back(loop.getInductionVar());
    // Set insertion point to the loop body so that the next loop
    // is inserted inside the current one.
    builder.setInsertionPointToStart(loop.getBody());
  }

  // Reverse the indices such that they are ordered as:
  //   <dim-0-idx, dim-1-idx, ...>
  std::reverse(indices.begin(), indices.end());
  mlir::Value reductionVal =
      genBody(builder, loc, elementType, array, flagRef, init, indices);

  // Unwind the loop nest and insert ResultOp on each level
  // to return the updated value of the reduction to the enclosing
  // loops.
  for (unsigned i = 0; i < rank; ++i) {
    auto result = builder.create<fir::ResultOp>(loc, reductionVal);
    // Proceed to the outer loop.
    auto loop = mlir::cast<fir::DoLoopOp>(result->getParentOp());
    reductionVal = loop.getResult(0);
    // Set insertion point after the loop operation that we have
    // just processed.
    builder.setInsertionPointAfter(loop.getOperation());
  }

  // End of loop nest. The insertion point is after the outermost loop.
  if (maskMayBeLogicalScalar) {
    if (fir::IfOp ifOp =
            mlir::dyn_cast<fir::IfOp>(builder.getBlock()->getParentOp())) {
      builder.create<fir::ResultOp>(loc, reductionVal);
      builder.setInsertionPointAfter(ifOp);
      // Redefine flagSet to escape scope of ifOp.
      flagSet = builder.createIntegerConstant(loc, resultElemType, 1);
      reductionVal = ifOp.getResult(0);
    }
  }

  // Check for case where array was full of max values.
  // flag will be 0 if mask was never true, 1 if mask was true at some point,
  // this is needed to avoid catching cases where we didn't access any elements
  // e.g. mask=.FALSE.
  mlir::Value flagValue =
      builder.create<fir::LoadOp>(loc, resultElemType, flagRef);
  mlir::Value flagCmp = builder.create<mlir::arith::CmpIOp>(
      loc, mlir::arith::CmpIPredicate::eq, flagValue, flagSet);
  fir::IfOp ifMaskTrueOp =
      builder.create<fir::IfOp>(loc, flagCmp, /*withElseRegion=*/false);
  builder.setInsertionPointToStart(&ifMaskTrueOp.getThenRegion().front());

  // Inside the flag check: is the reduced value still the initial value?
  mlir::Value testInit = initVal(builder, loc, elementType);
  mlir::Value stillInit;
  if (mlir::isa<mlir::FloatType>(elementType))
    stillInit = builder.create<mlir::arith::CmpFOp>(
        loc, mlir::arith::CmpFPredicate::OEQ, testInit, reductionVal);
  else
    stillInit = builder.create<mlir::arith::CmpIOp>(
        loc, mlir::arith::CmpIPredicate::eq, testInit, reductionVal);
  fir::IfOp ifMinSetOp =
      builder.create<fir::IfOp>(loc, stillInit, /*withElseRegion=*/false);
  builder.setInsertionPointToStart(&ifMinSetOp.getThenRegion().front());

  // Load output array with 1s instead of 0s.
  for (unsigned int i = 0; i < rank; ++i) {
    mlir::Value index = builder.createIntegerConstant(loc, idxTy, i);
    mlir::Value resultElemAddr =
        getAddrFn(builder, loc, resultElemType, resultArr, index);
    builder.create<fir::StoreOp>(loc, flagSet, resultElemAddr);
  }
  // Leave the builder positioned after the whole fix-up.
  builder.setInsertionPointAfter(ifMaskTrueOp);
}

static llvm::SmallVector<mlir::Value> nopLoopCond(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Value reductionVal) {
Expand Down

3 comments on commit 202917f

@vzakhari
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@clementval had to do 2572f45 after this commit. It does not look right that the FIRDialect component depends on the HLFIR dialect generated files. I guess it is not as broken as it could be if we had to make the FIRDialect component dependent on the HLFIRDialect component, but it still looks awkward. Can we extract these utilities into a separate component somewhere in lib/Optimizer/Transforms/Utils, and set up the dependencies on FIR and HLFIR there?

@davemgreen
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hello. Yeah I can try and work on that. I didn't find a great place for it when I was looking, it might need somewhere new. I'll see what I can figure out.

@vzakhari
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you!

Please sign in to comment.