Skip to content

Commit

Permalink
[FLANG] Support all arrays for LoopVersioning
Browse files Browse the repository at this point in the history
This patch makes arrays of more than two dimensions work, with a fix
for the way that the loop index is calculated, removing the restriction
on the number of dimensions.

This also changes the way that the actual index is calculated, such that
the stride is used rather than the extent of the previous dimension. Some
tests failed without fixing this - this was likely a latent bug in the
2D version too, but found in a test using 3D arrays, so wouldn't
have been found with 2D only. This introduces a division on the index
calculation - however it should be a nice and constant value allowing
a shift to be used to actually divide - or otherwise removed by using
other methods to calculate the result. In analysing code generated with
optimisation at -O3, there are no divides produced.

Some minor refactoring to avoid repeatedly asking for the "rank" of the
array being worked on.

This improves some of the SPEC-2017 ROMS code, in the same way as the
limited 2D array improvements - less overhead spent calculating array
indices in the inner-most loop and better use of vector-instructions.

Reviewed By: kiranchandramohan

Differential Revision: https://reviews.llvm.org/D151140
  • Loading branch information
Leporacanthicus committed May 30, 2023
1 parent b07d08b commit b75f9ce
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 37 deletions.
69 changes: 40 additions & 29 deletions flang/lib/Optimizer/Transforms/LoopVersioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ namespace {

class LoopVersioningPass
: public fir::impl::LoopVersioningBase<LoopVersioningPass> {

public:
void runOnOperation() override;
};
Expand Down Expand Up @@ -105,6 +104,7 @@ void LoopVersioningPass::runOnOperation() {
struct ArgInfo {
mlir::Value *arg;
size_t size;
unsigned rank;
fir::BoxDimsOp dims[CFI_MAX_RANK];
};

Expand All @@ -114,13 +114,11 @@ void LoopVersioningPass::runOnOperation() {
mlir::Block::BlockArgListType args = func.getArguments();
mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
fir::KindMapping kindMap = fir::getKindMapping(module);
mlir::SmallVector<ArgInfo> argsOfInterest;
mlir::SmallVector<ArgInfo, 4> argsOfInterest;
for (auto &arg : args) {
if (auto seqTy = getAsSequenceType(&arg)) {
unsigned rank = seqTy.getDimension();
// Currently limited to 1D or 2D arrays as that seems to give good
// improvement without excessive increase in code-size, etc.
if (rank > 0 && rank < 3 &&
if (rank > 0 &&
seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
size_t typeSize = 0;
mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
Expand All @@ -130,12 +128,9 @@ void LoopVersioningPass::runOnOperation() {
else if (auto cty = elementType.dyn_cast<fir::ComplexType>())
typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8;
if (typeSize)
argsOfInterest.push_back({&arg, typeSize, {}});
argsOfInterest.push_back({&arg, typeSize, rank, {}});
else
LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");

} else {
LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n");
}
}
}
Expand All @@ -145,14 +140,14 @@ void LoopVersioningPass::runOnOperation() {

struct OpsWithArgs {
mlir::Operation *op;
mlir::SmallVector<ArgInfo> argsAndDims;
mlir::SmallVector<ArgInfo, 4> argsAndDims;
};
// Now see if those arguments are used inside any loop.
mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;

func.walk([&](fir::DoLoopOp loop) {
mlir::Block &body = *loop.getBody();
mlir::SmallVector<ArgInfo> argsInLoop;
mlir::SmallVector<ArgInfo, 4> argsInLoop;
body.walk([&](fir::CoordinateOp op) {
// The current operation could be inside another loop than
// the one we're currently processing. Skip it, we'll get
Expand Down Expand Up @@ -199,16 +194,16 @@ void LoopVersioningPass::runOnOperation() {
mlir::Value allCompares = nullptr;
// Ensure all of the arrays are unit-stride.
for (auto &arg : op.argsAndDims) {

fir::SequenceType seqTy = getAsSequenceType(arg.arg);
unsigned rank = seqTy.getDimension();

// We only care about lowest order dimension.
for (unsigned i = 0; i < rank; i++) {
// Fetch all the dimensions of the array, except the last dimension.
// Always fetch the first dimension, however, so set ndims = 1 if
// we have one dim
unsigned ndims = arg.rank;
for (unsigned i = 0; i < ndims; i++) {
mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
*arg.arg, dimIdx);
}
// We only care about lowest order dimension, here.
mlir::Value elemSize =
builder.createIntegerConstant(loc, idxTy, arg.size);
mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
Expand Down Expand Up @@ -245,25 +240,41 @@ void LoopVersioningPass::runOnOperation() {
// Reduce the multi-dimensioned index to a single index.
// This is required because fir arrays do not support multiple dimensions
// with unknown dimensions at compile time.
// We then calculate the multidimensional array like this:
// arr(x, y, z) becomes arr(z * stride(2) + y * stride(1) + x)
// where stride is the distance between elements in the dimensions
// 0, 1 and 2 or x, y and z.
if (coop->getOperand(0) == *arg.arg &&
coop->getOperands().size() >= 2) {
builder.setInsertionPoint(coop);
mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0);
// Operand(1) = array; Operand(2) = index1; Operand(3) = index2
for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) {
mlir::Value totalIndex;
for (unsigned i = arg.rank - 1; i > 0; i--) {
// Operand(1) = array; Operand(2) = index1; Operand(3) = index2
mlir::Value curIndex =
builder.createConvert(loc, idxTy, coop->getOperand(i));
// First arg is Operand2, so dims[i-2] is 0-based i-1!
builder.createConvert(loc, idxTy, coop->getOperand(i + 1));
// Multiply by the stride of this array. Later we'll divide by the
// element size.
mlir::Value scale =
builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1));
builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
curIndex =
builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
loc, curIndex, totalIndex)
: curIndex;
}
mlir::Value elemSize =
builder.createIntegerConstant(loc, idxTy, arg.size);
// This is the lowest dimension - which doesn't need scaling
mlir::Value finalIndex =
builder.createConvert(loc, idxTy, coop->getOperand(1));
if (totalIndex) {
totalIndex = builder.create<mlir::arith::AddIOp>(
loc, totalIndex,
builder.create<mlir::arith::MulIOp>(loc, scale, curIndex));
loc,
builder.create<mlir::arith::DivSIOp>(loc, totalIndex, elemSize),
finalIndex);
} else {
totalIndex = finalIndex;
}
totalIndex = builder.create<mlir::arith::AddIOp>(
loc, totalIndex,
builder.createConvert(loc, idxTy, coop->getOperand(1)));

auto newOp = builder.create<fir::CoordinateOp>(
loc, builder.getRefType(elementType), caddr,
mlir::ValueRange{totalIndex});
Expand Down
150 changes: 142 additions & 8 deletions flang/test/Transforms/loop-versioning.fir
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: %[[CONV:.*]] = fir.convert %[[Y]] : {{.*}}
// CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] : {{.*}}
// CHECK: fir.do_loop %[[INDEX:.*]] = {{.*}}
// CHECK: %[[IND_PLUS_1:.*]] = arith.addi %{{.*}}, %[[INDEX]]
// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[IND_PLUS_1]]
// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[INDEX]]
// CHECK: %[[YINT:.*]] = fir.load %[[YADDR]] : {{.*}}
// CHECK: %[[YINDEX:.*]] = fir.convert %[[YINT]]
// CHECK: %[[XADDR:.*]] = fir.array_coor %[[X]] [%{{.*}}] %[[YINDEX]]
Expand Down Expand Up @@ -269,7 +268,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]]
// CHECK: %[[RES:.*]] = fir.do_loop {{.*}} {
// CHECK: %[[ADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %{{.*}}
// CHECK: %45 = fir.load %[[ADDR]] : !fir.ref<f32>
// CHECK: %{{.*}} = fir.load %[[ADDR]] : !fir.ref<f32>
// CHECK: }
// CHECK: fir.result %[[RES]] : {{.*}}
// CHECK: } else {
Expand Down Expand Up @@ -355,19 +354,22 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// Only the inner loop should be versioned.
// CHECK: fir.do_loop
// CHECK: %[[ZERO:.*]] = arith.constant 0 : index
// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[ONE:.*]] = arith.constant 1 : index
// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
// CHECK: %[[SIZE:.*]] = arith.constant 8 : index
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]]
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
// Check the 2D -> 1D coordinate conversion, should have a multiply and a final add.
// Some other operations are checked to sync the different parts.
// CHECK: arith.muli %[[DIMS]]#1, {{.*}}
// CHECK: %[[OUTER_IDX:.*]] = arith.addi {{.*}}
// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_IDX]], %[[INNER_IDX]]
// CHECK: %[[OUTER_DIV:.*]] = arith.divsi %[[OUTER_IDX]], %[[ITEMSIZE]]
// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_DIV]], %[[INNER_IDX]]
// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C2D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
Expand All @@ -384,4 +386,136 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
// CHECK: return

// -----

// subroutine sum3d(a, nx, ny, nz)
// real*8 :: a(:, :, :)
// integer :: nx, ny, nz
// real*8 :: sum
// integer :: i, j, k
// sum = 0
// do k=1,nz
// do j=1,ny
// do i=0,nx
// sum = sum + a(i, j, k)
// end do
// end do
// end do
// end subroutine sum3d


func.func @sum3d(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nx"}, %arg2: !fir.ref<i32> {fir.bindc_name = "ny"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nz"}) {
%0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum3dEi"}
%1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmoduleFsum3dEj"}
%2 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmoduleFsum3dEk"}
%3 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum3dEsum"}
%cst = arith.constant 0.000000e+00 : f64
fir.store %cst to %3 : !fir.ref<f64>
%c1_i32 = arith.constant 1 : i32
%4 = fir.convert %c1_i32 : (i32) -> index
%5 = fir.load %arg3 : !fir.ref<i32>
%6 = fir.convert %5 : (i32) -> index
%c1 = arith.constant 1 : index
%7 = fir.convert %4 : (index) -> i32
%8:2 = fir.do_loop %arg4 = %4 to %6 step %c1 iter_args(%arg5 = %7) -> (index, i32) {
fir.store %arg5 to %2 : !fir.ref<i32>
%c1_i32_0 = arith.constant 1 : i32
%9 = fir.convert %c1_i32_0 : (i32) -> index
%10 = fir.load %arg2 : !fir.ref<i32>
%11 = fir.convert %10 : (i32) -> index
%c1_1 = arith.constant 1 : index
%12 = fir.convert %9 : (index) -> i32
%13:2 = fir.do_loop %arg6 = %9 to %11 step %c1_1 iter_args(%arg7 = %12) -> (index, i32) {
fir.store %arg7 to %1 : !fir.ref<i32>
%c0_i32 = arith.constant 0 : i32
%18 = fir.convert %c0_i32 : (i32) -> index
%19 = fir.load %arg1 : !fir.ref<i32>
%20 = fir.convert %19 : (i32) -> index
%c1_2 = arith.constant 1 : index
%21 = fir.convert %18 : (index) -> i32
%22:2 = fir.do_loop %arg8 = %18 to %20 step %c1_2 iter_args(%arg9 = %21) -> (index, i32) {
fir.store %arg9 to %0 : !fir.ref<i32>
%27 = fir.load %3 : !fir.ref<f64>
%28 = fir.load %0 : !fir.ref<i32>
%29 = fir.convert %28 : (i32) -> i64
%c1_i64 = arith.constant 1 : i64
%30 = arith.subi %29, %c1_i64 : i64
%31 = fir.load %1 : !fir.ref<i32>
%32 = fir.convert %31 : (i32) -> i64
%c1_i64_3 = arith.constant 1 : i64
%33 = arith.subi %32, %c1_i64_3 : i64
%34 = fir.load %2 : !fir.ref<i32>
%35 = fir.convert %34 : (i32) -> i64
%c1_i64_4 = arith.constant 1 : i64
%36 = arith.subi %35, %c1_i64_4 : i64
%37 = fir.coordinate_of %arg0, %30, %33, %36 : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
%38 = fir.load %37 : !fir.ref<f64>
%39 = arith.addf %27, %38 fastmath<contract> : f64
fir.store %39 to %3 : !fir.ref<f64>
%40 = arith.addi %arg8, %c1_2 : index
%41 = fir.convert %c1_2 : (index) -> i32
%42 = fir.load %0 : !fir.ref<i32>
%43 = arith.addi %42, %41 : i32
fir.result %40, %43 : index, i32
}
fir.store %22#1 to %0 : !fir.ref<i32>
%23 = arith.addi %arg6, %c1_1 : index
%24 = fir.convert %c1_1 : (index) -> i32
%25 = fir.load %1 : !fir.ref<i32>
%26 = arith.addi %25, %24 : i32
fir.result %23, %26 : index, i32
}
fir.store %13#1 to %1 : !fir.ref<i32>
%14 = arith.addi %arg4, %c1 : index
%15 = fir.convert %c1 : (index) -> i32
%16 = fir.load %2 : !fir.ref<i32>
%17 = arith.addi %16, %15 : i32
fir.result %14, %17 : index, i32
}
fir.store %8#1 to %2 : !fir.ref<i32>
return
}

// Note this only checks the expected transformation, not the entire generated code:
// CHECK-LABEL: func.func @sum3d(
// CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {{.*}})
// Only the inner loop should be versioned.
// CHECK: fir.do_loop
// CHECK: %[[ZERO:.*]] = arith.constant 0 : index
// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
// CHECK: %[[ONE:.*]] = arith.constant 1 : index
// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
// CHECK: %[[TWO:.*]] = arith.constant 2 : index
// CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[ARG0]], %[[TWO]] : {{.*}}
// CHECK: %[[SIZE:.*]] = arith.constant 8 : index
// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
// Check the 3D -> 1D coordinate conversion, should have a multiply and a final add.
// Some other operations are checked to sync the different parts.
// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS2]]#2, {{.*}}
// CHECK: %[[MIDDLE_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
// CHECK: %[[MIDDLE_SUM:.*]] = arith.addi %[[MIDDLE_IDX]], %[[OUTER_IDX]]
// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
// CHECK: %[[MIDDLE_DIV:.*]] = arith.divsi %[[MIDDLE_SUM]], %[[ITEMSIZE]]
// CHECK: %[[C3D:.*]] = arith.addi %[[MIDDLE_DIV]], %[[INNER_IDX]]
// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C3D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
// CHECK: }
// CHECK: fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1
// CHECK: } else {
// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}}
// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref<f64>
// CHECK: fir.result %{{.*}}, %{{.*}}
// CHECK: }
// CHECK: fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1
// CHECK: }
// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
// CHECK: return

} // End module

0 comments on commit b75f9ce

Please sign in to comment.