Skip to content

Commit

Permalink
[FLANG] Support all arrays for LoopVersioning
Browse files Browse the repository at this point in the history
This patch makes arrays of more than two dimensions work, with a fix
for the way that the loop index is calculated, removing the restriction
on the number of dimensions.

This also changes the way that the actual index is calculated, such that
the stride is used rather than the extent of the previous dimension. Some
tests failed without fixing this - this was likely a latent bug in the
2D version too, but found in a test using 3D arrays, so wouldn't
have been found with 2D only. This introduces a division on the index
calculation - however it should be a nice and constant value allowing
a shift to be used to actually divide - or otherwise removed by using
other methods to calculate the result. In analysing code generated with
optimisation at -O3, there are no divides produced.

Some minor refactoring to avoid repeatedly asking for the "rank" of the
array being worked on.

This improves some of the SPEC-2017 ROMS code, in the same way as the
limited 2D array improvements - less overhead spent calculating array
indices in the inner-most loop and better use of vector-instructions.

Reviewed By: kiranchandramohan

Differential Revision: https://reviews.llvm.org/D151140
  • Loading branch information
Leporacanthicus committed May 30, 2023
1 parent b07d08b commit b75f9ce
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 37 deletions.
69 changes: 40 additions & 29 deletions flang/lib/Optimizer/Transforms/LoopVersioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ namespace {

class LoopVersioningPass
: public fir::impl::LoopVersioningBase<LoopVersioningPass> {

public:
void runOnOperation() override;
};
Expand Down Expand Up @@ -105,6 +104,7 @@ void LoopVersioningPass::runOnOperation() {
struct ArgInfo {
mlir::Value *arg;
size_t size;
unsigned rank;
fir::BoxDimsOp dims[CFI_MAX_RANK];
};

Expand All @@ -114,13 +114,11 @@ void LoopVersioningPass::runOnOperation() {
mlir::Block::BlockArgListType args = func.getArguments();
mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
fir::KindMapping kindMap = fir::getKindMapping(module);
mlir::SmallVector<ArgInfo> argsOfInterest;
mlir::SmallVector<ArgInfo, 4> argsOfInterest;
for (auto &arg : args) {
if (auto seqTy = getAsSequenceType(&arg)) {
unsigned rank = seqTy.getDimension();
// Currently limited to 1D or 2D arrays as that seems to give good
// improvement without excessive increase in code-size, etc.
if (rank > 0 && rank < 3 &&
if (rank > 0 &&
seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
size_t typeSize = 0;
mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
Expand All @@ -130,12 +128,9 @@ void LoopVersioningPass::runOnOperation() {
else if (auto cty = elementType.dyn_cast<fir::ComplexType>())
typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8;
if (typeSize)
argsOfInterest.push_back({&arg, typeSize, {}});
argsOfInterest.push_back({&arg, typeSize, rank, {}});
else
LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");

} else {
LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n");
}
}
}
Expand All @@ -145,14 +140,14 @@ void LoopVersioningPass::runOnOperation() {

struct OpsWithArgs {
mlir::Operation *op;
mlir::SmallVector<ArgInfo> argsAndDims;
mlir::SmallVector<ArgInfo, 4> argsAndDims;
};
// Now see if those arguments are used inside any loop.
mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;

func.walk([&](fir::DoLoopOp loop) {
mlir::Block &body = *loop.getBody();
mlir::SmallVector<ArgInfo> argsInLoop;
mlir::SmallVector<ArgInfo, 4> argsInLoop;
body.walk([&](fir::CoordinateOp op) {
// The current operation could be inside another loop than
// the one we're currently processing. Skip it, we'll get
Expand Down Expand Up @@ -199,16 +194,16 @@ void LoopVersioningPass::runOnOperation() {
mlir::Value allCompares = nullptr;
// Ensure all of the arrays are unit-stride.
for (auto &arg : op.argsAndDims) {

fir::SequenceType seqTy = getAsSequenceType(arg.arg);
unsigned rank = seqTy.getDimension();

// We only care about lowest order dimension.
for (unsigned i = 0; i < rank; i++) {
// Fetch all the dimensions of the array, except the last dimension.
// Always fetch the first dimension, however, so set ndims = 1 if
// we have one dim
unsigned ndims = arg.rank;
for (unsigned i = 0; i < ndims; i++) {
mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
*arg.arg, dimIdx);
}
// We only care about lowest order dimension, here.
mlir::Value elemSize =
builder.createIntegerConstant(loc, idxTy, arg.size);
mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
Expand Down Expand Up @@ -245,25 +240,41 @@ void LoopVersioningPass::runOnOperation() {
// Reduce the multi-dimensioned index to a single index.
// This is required because fir arrays do not support multiple dimensions
// with unknown dimensions at compile time.
// We then calculate the multidimensional array like this:
// arr(x, y, z) becomes arr(z * stride(2) + y * stride(1) + x)
// where stride is the distance between elements in the dimensions
// 0, 1 and 2 or x, y and z.
if (coop->getOperand(0) == *arg.arg &&
coop->getOperands().size() >= 2) {
builder.setInsertionPoint(coop);
mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0);
// Operand(1) = array; Operand(2) = index1; Operand(3) = index2
for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) {
mlir::Value totalIndex;
for (unsigned i = arg.rank - 1; i > 0; i--) {
// Operand(1) = array; Operand(2) = index1; Operand(3) = index2
mlir::Value curIndex =
builder.createConvert(loc, idxTy, coop->getOperand(i));
// First arg is Operand2, so dims[i-2] is 0-based i-1!
builder.createConvert(loc, idxTy, coop->getOperand(i + 1));
// Multiply by the stride of this array. Later we'll divide by the
// element size.
mlir::Value scale =
builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1));
builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
curIndex =
builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
loc, curIndex, totalIndex)
: curIndex;
}
mlir::Value elemSize =
builder.createIntegerConstant(loc, idxTy, arg.size);
// This is the lowest dimension - which doesn't need scaling
mlir::Value finalIndex =
builder.createConvert(loc, idxTy, coop->getOperand(1));
if (totalIndex) {
totalIndex = builder.create<mlir::arith::AddIOp>(
loc, totalIndex,
builder.create<mlir::arith::MulIOp>(loc, scale, curIndex));
loc,
builder.create<mlir::arith::DivSIOp>(loc, totalIndex, elemSize),
finalIndex);
} else {
totalIndex = finalIndex;
}
totalIndex = builder.create<mlir::arith::AddIOp>(
loc, totalIndex,
builder.createConvert(loc, idxTy, coop->getOperand(1)));

auto newOp = builder.create<fir::CoordinateOp>(
loc, builder.getRefType(elementType), caddr,
mlir::ValueRange{totalIndex});
Expand Down
150 changes: 142 additions & 8 deletions flang/test/Transforms/loop-versioning.fir
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: %[[CONV:.*]] = fir.convert %[[Y]] : {{.*}}
// CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] : {{.*}}
// CHECK: fir.do_loop %[[INDEX:.*]] = {{.*}}
// CHECK: %[[IND_PLUS_1:.*]] = arith.addi %{{.*}}, %[[INDEX]]
// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[IND_PLUS_1]]
// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[INDEX]]
// CHECK: %[[YINT:.*]] = fir.load %[[YADDR]] : {{.*}}
// CHECK: %[[YINDEX:.*]] = fir.convert %[[YINT]]
// CHECK: %[[XADDR:.*]] = fir.array_coor %[[X]] [%{{.*}}] %[[YINDEX]]
Expand Down Expand Up @@ -269,7 +268,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]]
// CHECK: %[[RES:.*]] = fir.do_loop {{.*}} {
// CHECK: %[[ADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %{{.*}}
// CHECK: %45 = fir.load %[[ADDR]] : !fir.ref<f32>
// CHECK: %{{.*}} = fir.load %[[ADDR]] : !fir.ref<f32>
// CHECK: }
// CHECK: fir.result %[[RES]] : {{.*}}
// CHECK: } else {
Expand Down Expand Up @@ -355,19 +354,22 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// Only the inner loop should be versioned.
// CHECK: fir.do_loop
// CHECK: %[[ZERO:.*]] = arith.constant 0 : index
// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[ONE:.*]] = arith.constant 1 : index
// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
// CHECK: %[[SIZE:.*]] = arith.constant 8 : index
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]]
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
// Check the 2D -> 1D coordinate conversion, should have a multiply and a final add.
// Some other operations are checked to sync the different parts.
// CHECK: arith.muli %[[DIMS]]#1, {{.*}}
// CHECK: %[[OUTER_IDX:.*]] = arith.addi {{.*}}
// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_IDX]], %[[INNER_IDX]]
// CHECK: %[[OUTER_DIV:.*]] = arith.divsi %[[OUTER_IDX]], %[[ITEMSIZE]]
// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_DIV]], %[[INNER_IDX]]
// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C2D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
Expand All @@ -384,4 +386,136 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
// CHECK: return

// -----

// subroutine sum3d(a, nx, ny, nz)
// real*8 :: a(:, :, :)
// integer :: nx, ny, nz
// real*8 :: sum
// integer :: i, j, k
// sum = 0
// do k=1,nz
// do j=1,ny
// do i=0,nx
// sum = sum + a(i, j, k)
// end do
// end do
// end do
// end subroutine sum3d


func.func @sum3d(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nx"}, %arg2: !fir.ref<i32> {fir.bindc_name = "ny"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nz"}) {
%0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum3dEi"}
%1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmoduleFsum3dEj"}
%2 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmoduleFsum3dEk"}
%3 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum3dEsum"}
%cst = arith.constant 0.000000e+00 : f64
fir.store %cst to %3 : !fir.ref<f64>
%c1_i32 = arith.constant 1 : i32
%4 = fir.convert %c1_i32 : (i32) -> index
%5 = fir.load %arg3 : !fir.ref<i32>
%6 = fir.convert %5 : (i32) -> index
%c1 = arith.constant 1 : index
%7 = fir.convert %4 : (index) -> i32
%8:2 = fir.do_loop %arg4 = %4 to %6 step %c1 iter_args(%arg5 = %7) -> (index, i32) {
fir.store %arg5 to %2 : !fir.ref<i32>
%c1_i32_0 = arith.constant 1 : i32
%9 = fir.convert %c1_i32_0 : (i32) -> index
%10 = fir.load %arg2 : !fir.ref<i32>
%11 = fir.convert %10 : (i32) -> index
%c1_1 = arith.constant 1 : index
%12 = fir.convert %9 : (index) -> i32
%13:2 = fir.do_loop %arg6 = %9 to %11 step %c1_1 iter_args(%arg7 = %12) -> (index, i32) {
fir.store %arg7 to %1 : !fir.ref<i32>
%c0_i32 = arith.constant 0 : i32
%18 = fir.convert %c0_i32 : (i32) -> index
%19 = fir.load %arg1 : !fir.ref<i32>
%20 = fir.convert %19 : (i32) -> index
%c1_2 = arith.constant 1 : index
%21 = fir.convert %18 : (index) -> i32
%22:2 = fir.do_loop %arg8 = %18 to %20 step %c1_2 iter_args(%arg9 = %21) -> (index, i32) {
fir.store %arg9 to %0 : !fir.ref<i32>
%27 = fir.load %3 : !fir.ref<f64>
%28 = fir.load %0 : !fir.ref<i32>
%29 = fir.convert %28 : (i32) -> i64
%c1_i64 = arith.constant 1 : i64
%30 = arith.subi %29, %c1_i64 : i64
%31 = fir.load %1 : !fir.ref<i32>
%32 = fir.convert %31 : (i32) -> i64
%c1_i64_3 = arith.constant 1 : i64
%33 = arith.subi %32, %c1_i64_3 : i64
%34 = fir.load %2 : !fir.ref<i32>
%35 = fir.convert %34 : (i32) -> i64
%c1_i64_4 = arith.constant 1 : i64
%36 = arith.subi %35, %c1_i64_4 : i64
%37 = fir.coordinate_of %arg0, %30, %33, %36 : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
%38 = fir.load %37 : !fir.ref<f64>
%39 = arith.addf %27, %38 fastmath<contract> : f64
fir.store %39 to %3 : !fir.ref<f64>
%40 = arith.addi %arg8, %c1_2 : index
%41 = fir.convert %c1_2 : (index) -> i32
%42 = fir.load %0 : !fir.ref<i32>
%43 = arith.addi %42, %41 : i32
fir.result %40, %43 : index, i32
}
fir.store %22#1 to %0 : !fir.ref<i32>
%23 = arith.addi %arg6, %c1_1 : index
%24 = fir.convert %c1_1 : (index) -> i32
%25 = fir.load %1 : !fir.ref<i32>
%26 = arith.addi %25, %24 : i32
fir.result %23, %26 : index, i32
}
fir.store %13#1 to %1 : !fir.ref<i32>
%14 = arith.addi %arg4, %c1 : index
%15 = fir.convert %c1 : (index) -> i32
%16 = fir.load %2 : !fir.ref<i32>
%17 = arith.addi %16, %15 : i32
fir.result %14, %17 : index, i32
}
fir.store %8#1 to %2 : !fir.ref<i32>
return
}

// Note this only checks the expected transformation, not the entire generated code:
// CHECK-LABEL: func.func @sum3d(
// CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {{.*}})
// Only the inner loop should be versioned.
// CHECK: fir.do_loop
// CHECK: %[[ZERO:.*]] = arith.constant 0 : index
// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[ONE:.*]] = arith.constant 1 : index
// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
// CHECK: %[[TWO:.*]] = arith.constant 2 : index
// CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[ARG0]], %[[TWO]] : {{.*}}
// CHECK: %[[SIZE:.*]] = arith.constant 8 : index
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
// Check the 3D -> 1D coordinate conversion, should have a multiply and a final add.
// Some other operations are checked to sync the different parts.
// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS2]]#2, {{.*}}
// CHECK: %[[MIDDLE_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
// CHECK: %[[MIDDLE_SUM:.*]] = arith.addi %[[MIDDLE_IDX]], %[[OUTER_IDX]]
// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
// CHECK: %[[MIDDLE_DIV:.*]] = arith.divsi %[[MIDDLE_SUM]], %[[ITEMSIZE]]
// CHECK: %[[C3D:.*]] = arith.addi %[[MIDDLE_DIV]], %[[INNER_IDX]]
// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C3D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
// CHECK: }
// CHECK: fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1
// CHECK: } else {
// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}}
// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
// CHECK: }
// CHECK: fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1
// CHECK: }
// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
// CHECK: return

} // End module

0 comments on commit b75f9ce

Please sign in to comment.