From b75f9ce3fe861473e36ea6715d82b6954ea2b815 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Mon, 22 May 2023 15:48:01 +0100 Subject: [PATCH] [FLANG] Support all arrays for LoopVersioning This patch makes more than 2D arrays work, with a fix for the way that loop index is calculated. Removing the restriction of number of dimensions. This also changes the way that the actual index is calculated, such that the stride is used rather than the extent of the previous dimension. Some tests failed without fixing this - this was likely a latent bug in the 2D version too, but found in a test using 3D arrays, so wouldn't have been found with 2D only. This introduces a division on the index calculation - however it should be a nice and constant value allowing a shift to be used to actually divide - or otherwise removed by using other methods to calculate the result. In analysing code generated with optimisation at -O3, there are no divides produced. Some minor refactoring to avoid repeatedly asking for the "rank" of the array being worked on. This improves some of the SPEC-2017 ROMS code, in the same way as the limited 2D array improvements - less overhead spent calculating array indices in the inner-most loop and better use of vector-instructions. 
Reviewed By: kiranchandramohan Differential Revision: https://reviews.llvm.org/D151140 --- .../Optimizer/Transforms/LoopVersioning.cpp | 69 ++++---- flang/test/Transforms/loop-versioning.fir | 150 +++++++++++++++++- 2 files changed, 182 insertions(+), 37 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 4695d39ee41cb..f1588d2f622a1 100644 --- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -73,7 +73,6 @@ namespace { class LoopVersioningPass : public fir::impl::LoopVersioningBase { - public: void runOnOperation() override; }; @@ -105,6 +104,7 @@ void LoopVersioningPass::runOnOperation() { struct ArgInfo { mlir::Value *arg; size_t size; + unsigned rank; fir::BoxDimsOp dims[CFI_MAX_RANK]; }; @@ -114,13 +114,11 @@ void LoopVersioningPass::runOnOperation() { mlir::Block::BlockArgListType args = func.getArguments(); mlir::ModuleOp module = func->getParentOfType(); fir::KindMapping kindMap = fir::getKindMapping(module); - mlir::SmallVector argsOfInterest; + mlir::SmallVector argsOfInterest; for (auto &arg : args) { if (auto seqTy = getAsSequenceType(&arg)) { unsigned rank = seqTy.getDimension(); - // Currently limited to 1D or 2D arrays as that seems to give good - // improvement without excessive increase in code-size, etc. 
- if (rank > 0 && rank < 3 && + if (rank > 0 && seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) { size_t typeSize = 0; mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType()); @@ -130,12 +128,9 @@ void LoopVersioningPass::runOnOperation() { else if (auto cty = elementType.dyn_cast()) typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8; if (typeSize) - argsOfInterest.push_back({&arg, typeSize, {}}); + argsOfInterest.push_back({&arg, typeSize, rank, {}}); else LLVM_DEBUG(llvm::dbgs() << "Type not supported\n"); - - } else { - LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n"); } } } @@ -145,14 +140,14 @@ void LoopVersioningPass::runOnOperation() { struct OpsWithArgs { mlir::Operation *op; - mlir::SmallVector argsAndDims; + mlir::SmallVector argsAndDims; }; // Now see if those arguments are used inside any loop. mlir::SmallVector loopsOfInterest; func.walk([&](fir::DoLoopOp loop) { mlir::Block &body = *loop.getBody(); - mlir::SmallVector argsInLoop; + mlir::SmallVector argsInLoop; body.walk([&](fir::CoordinateOp op) { // The current operation could be inside another loop than // the one we're currently processing. Skip it, we'll get @@ -199,16 +194,16 @@ void LoopVersioningPass::runOnOperation() { mlir::Value allCompares = nullptr; // Ensure all of the arrays are unit-stride. for (auto &arg : op.argsAndDims) { - - fir::SequenceType seqTy = getAsSequenceType(arg.arg); - unsigned rank = seqTy.getDimension(); - - // We only care about lowest order dimension. - for (unsigned i = 0; i < rank; i++) { + // Fetch all the dimensions of the array, except the last dimension. + // Always fetch the first dimension, however, so set ndims = 1 if + // we have one dim + unsigned ndims = arg.rank; + for (unsigned i = 0; i < ndims; i++) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, *arg.arg, dimIdx); } + // We only care about lowest order dimension, here. 
mlir::Value elemSize = builder.createIntegerConstant(loc, idxTy, arg.size); mlir::Value cmp = builder.create( @@ -245,25 +240,41 @@ void LoopVersioningPass::runOnOperation() { // Reduce the multi-dimensioned index to a single index. // This is required becase fir arrays do not support multiple dimensions // with unknown dimensions at compile time. + // We then calculate the multidimensional array like this: + // arr(x, y, z) becomes arr(z * stride(2) + y * stride(1) + x) + // where stride is the distance between elements in the dimensions + // 0, 1 and 2 or x, y and z. if (coop->getOperand(0) == *arg.arg && coop->getOperands().size() >= 2) { builder.setInsertionPoint(coop); - mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0); - // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 - for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) { + mlir::Value totalIndex; + for (unsigned i = arg.rank - 1; i > 0; i--) { + // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 mlir::Value curIndex = - builder.createConvert(loc, idxTy, coop->getOperand(i)); - // First arg is Operand2, so dims[i-2] is 0-based i-1! + builder.createConvert(loc, idxTy, coop->getOperand(i + 1)); + // Multiply by the stride of this array. Later we'll divide by the + // element size. mlir::Value scale = - builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1)); + builder.createConvert(loc, idxTy, arg.dims[i].getResult(2)); + curIndex = + builder.create(loc, scale, curIndex); + totalIndex = (totalIndex) ? 
builder.create( + loc, curIndex, totalIndex) + : curIndex; + } + mlir::Value elemSize = + builder.createIntegerConstant(loc, idxTy, arg.size); + // This is the lowest dimension - which doesn't need scaling + mlir::Value finalIndex = + builder.createConvert(loc, idxTy, coop->getOperand(1)); + if (totalIndex) { totalIndex = builder.create( - loc, totalIndex, - builder.create(loc, scale, curIndex)); + loc, + builder.create(loc, totalIndex, elemSize), + finalIndex); + } else { + totalIndex = finalIndex; } - totalIndex = builder.create( - loc, totalIndex, - builder.createConvert(loc, idxTy, coop->getOperand(1))); - auto newOp = builder.create( loc, builder.getRefType(elementType), caddr, mlir::ValueRange{totalIndex}); diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir index bad02b7a226c5..3c8930ccbde47 100644 --- a/flang/test/Transforms/loop-versioning.fir +++ b/flang/test/Transforms/loop-versioning.fir @@ -156,8 +156,7 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: %[[CONV:.*]] = fir.convert %[[Y]] : {{.*}} // CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] : {{.*}} // CHECK: fir.do_loop %[[INDEX:.*]] = {{.*}} -// CHECK: %[[IND_PLUS_1:.*]] = arith.addi %{{.*}}, %[[INDEX]] -// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[IND_PLUS_1]] +// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[INDEX]] // CHECK: %[[YINT:.*]] = fir.load %[[YADDR]] : {{.*}} // CHECK: %[[YINDEX:.*]] = fir.convert %[[YINT]] // CHECK: %[[XADDR:.*]] = fir.array_coor %[[X]] [%{{.*}}] %[[YINDEX]] @@ -269,7 +268,7 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] // CHECK: %[[RES:.*]] = fir.do_loop {{.*}} { // CHECK: %[[ADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %{{.*}} -// CHECK: %45 = fir.load %[[ADDR]] : !fir.ref +// CHECK: %{{.*}} = fir.load %[[ADDR]] : !fir.ref // CHECK: } // CHECK: fir.result %[[RES]] : {{.*}} // CHECK: } else { 
@@ -355,19 +354,22 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // Only inner loop should be verisoned. // CHECK: fir.do_loop // CHECK: %[[ZERO:.*]] = arith.constant 0 : index -// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}} // CHECK: %[[SIZE:.*]] = arith.constant 8 : index -// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]] +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]] // CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} // CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] // CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> // CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} // Check the 2D -> 1D coordinate conversion, should have a multiply and a final add. // Some other operations are checked to synch the different parts. 
-// CHECK: arith.muli %[[DIMS]]#1, {{.*}} -// CHECK: %[[OUTER_IDX:.*]] = arith.addi {{.*}} +// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}} +// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index // CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}} -// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_IDX]], %[[INNER_IDX]] +// CHECK: %[[OUTER_DIV:.*]] = arith.divsi %[[OUTER_IDX]], %[[ITEMSIZE]] +// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_DIV]], %[[INNER_IDX]] // CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C2D]] : (!fir.ref>, index) -> !fir.ref // CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref // CHECK: fir.result %{{.*}}, %{{.*}} @@ -384,4 +386,136 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} // CHECK: return +// ----- + +// subroutine sum3d(a, nx, ny, nz) +// real*8 :: a(:, :, :) +// integer :: nx, ny, nz +// real*8 :: sum +// integer :: i, j, k +// sum = 0 +// do k=1,nz +// do j=1,ny +// do i=0,nx +// sum = sum + a(i, j, k) +// end do +// end do +// end do +// end subroutine sum3d + + + func.func @sum3d(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "nx"}, %arg2: !fir.ref {fir.bindc_name = "ny"}, %arg3: !fir.ref {fir.bindc_name = "nz"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum3dEi"} + %1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmoduleFsum3dEj"} + %2 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmoduleFsum3dEk"} + %3 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum3dEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %3 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %4 = fir.convert %c1_i32 : (i32) -> index + %5 = fir.load %arg3 : !fir.ref + %6 = fir.convert %5 : (i32) -> index + %c1 = arith.constant 1 : index + %7 = fir.convert %4 : (index) -> i32 + %8:2 = fir.do_loop %arg4 = %4 to %6 step %c1 iter_args(%arg5 = %7) -> (index, i32) { + fir.store %arg5 to %2 : !fir.ref + 
%c1_i32_0 = arith.constant 1 : i32 + %9 = fir.convert %c1_i32_0 : (i32) -> index + %10 = fir.load %arg2 : !fir.ref + %11 = fir.convert %10 : (i32) -> index + %c1_1 = arith.constant 1 : index + %12 = fir.convert %9 : (index) -> i32 + %13:2 = fir.do_loop %arg6 = %9 to %11 step %c1_1 iter_args(%arg7 = %12) -> (index, i32) { + fir.store %arg7 to %1 : !fir.ref + %c0_i32 = arith.constant 0 : i32 + %18 = fir.convert %c0_i32 : (i32) -> index + %19 = fir.load %arg1 : !fir.ref + %20 = fir.convert %19 : (i32) -> index + %c1_2 = arith.constant 1 : index + %21 = fir.convert %18 : (index) -> i32 + %22:2 = fir.do_loop %arg8 = %18 to %20 step %c1_2 iter_args(%arg9 = %21) -> (index, i32) { + fir.store %arg9 to %0 : !fir.ref + %27 = fir.load %3 : !fir.ref + %28 = fir.load %0 : !fir.ref + %29 = fir.convert %28 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %30 = arith.subi %29, %c1_i64 : i64 + %31 = fir.load %1 : !fir.ref + %32 = fir.convert %31 : (i32) -> i64 + %c1_i64_3 = arith.constant 1 : i64 + %33 = arith.subi %32, %c1_i64_3 : i64 + %34 = fir.load %2 : !fir.ref + %35 = fir.convert %34 : (i32) -> i64 + %c1_i64_4 = arith.constant 1 : i64 + %36 = arith.subi %35, %c1_i64_4 : i64 + %37 = fir.coordinate_of %arg0, %30, %33, %36 : (!fir.box>, i64, i64, i64) -> !fir.ref + %38 = fir.load %37 : !fir.ref + %39 = arith.addf %27, %38 fastmath : f64 + fir.store %39 to %3 : !fir.ref + %40 = arith.addi %arg8, %c1_2 : index + %41 = fir.convert %c1_2 : (index) -> i32 + %42 = fir.load %0 : !fir.ref + %43 = arith.addi %42, %41 : i32 + fir.result %40, %43 : index, i32 + } + fir.store %22#1 to %0 : !fir.ref + %23 = arith.addi %arg6, %c1_1 : index + %24 = fir.convert %c1_1 : (index) -> i32 + %25 = fir.load %1 : !fir.ref + %26 = arith.addi %25, %24 : i32 + fir.result %23, %26 : index, i32 + } + fir.store %13#1 to %1 : !fir.ref + %14 = arith.addi %arg4, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %2 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, 
i32 + } + fir.store %8#1 to %2 : !fir.ref + return + } + +// Note this only checks the expected transformation, not the entire generated code: +// CHECK-LABEL: func.func @sum3d( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {{.*}}) +// Only inner loop should be versioned. +// CHECK: fir.do_loop +// CHECK: %[[ZERO:.*]] = arith.constant 0 : index +// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}} +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[ARG0]], %[[TWO]] : {{.*}} +// CHECK: %[[SIZE:.*]] = arith.constant 8 : index +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]] +// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} +// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] +// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> +// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} +// Check the 3D -> 1D coordinate conversion, should have a multiply and a final add. +// Some other operations are checked to synch the different parts. 
+// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS2]]#2, {{.*}} +// CHECK: %[[MIDDLE_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}} +// CHECK: %[[MIDDLE_SUM:.*]] = arith.addi %[[MIDDLE_IDX]], %[[OUTER_IDX]] +// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index +// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}} +// CHECK: %[[MIDDLE_DIV:.*]] = arith.divsi %[[MIDDLE_SUM]], %[[ITEMSIZE]] +// CHECK: %[[C3D:.*]] = arith.addi %[[MIDDLE_DIV]], %[[INNER_IDX]] +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C3D]] : (!fir.ref>, index) -> !fir.ref +// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK: fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1 +// CHECK: } else { +// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box>, i64, i64, i64) -> !fir.ref +// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK: fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1 +// CHECK: } +// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} +// CHECK: return + } // End module