diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index 4695d39ee41cb..f1588d2f622a1 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -73,7 +73,6 @@ namespace {
 
 class LoopVersioningPass
     : public fir::impl::LoopVersioningBase<LoopVersioningPass> {
-
 public:
   void runOnOperation() override;
 };
@@ -105,6 +104,7 @@ void LoopVersioningPass::runOnOperation() {
   struct ArgInfo {
     mlir::Value *arg;
     size_t size;
+    unsigned rank;
     fir::BoxDimsOp dims[CFI_MAX_RANK];
   };
 
@@ -114,13 +114,11 @@ void LoopVersioningPass::runOnOperation() {
   mlir::Block::BlockArgListType args = func.getArguments();
   mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
   fir::KindMapping kindMap = fir::getKindMapping(module);
-  mlir::SmallVector<ArgInfo> argsOfInterest;
+  mlir::SmallVector<ArgInfo, 4> argsOfInterest;
   for (auto &arg : args) {
     if (auto seqTy = getAsSequenceType(&arg)) {
       unsigned rank = seqTy.getDimension();
-      // Currently limited to 1D or 2D arrays as that seems to give good
-      // improvement without excessive increase in code-size, etc.
-      if (rank > 0 && rank < 3 &&
+      if (rank > 0 &&
           seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
         size_t typeSize = 0;
         mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
@@ -130,12 +128,9 @@ void LoopVersioningPass::runOnOperation() {
         else if (auto cty = elementType.dyn_cast<fir::ComplexType>())
           typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8;
         if (typeSize)
-          argsOfInterest.push_back({&arg, typeSize, {}});
+          argsOfInterest.push_back({&arg, typeSize, rank, {}});
         else
           LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");
-
-      } else {
-        LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n");
       }
     }
   }
@@ -145,14 +140,14 @@ void LoopVersioningPass::runOnOperation() {
 
   struct OpsWithArgs {
     mlir::Operation *op;
-    mlir::SmallVector<ArgInfo> argsAndDims;
+    mlir::SmallVector<ArgInfo, 4> argsAndDims;
   };
   // Now see if those arguments are used inside any loop.
   mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
 
   func.walk([&](fir::DoLoopOp loop) {
     mlir::Block &body = *loop.getBody();
-    mlir::SmallVector<ArgInfo> argsInLoop;
+    mlir::SmallVector<ArgInfo, 4> argsInLoop;
     body.walk([&](fir::CoordinateOp op) {
       // The current operation could be inside another loop than
       // the one we're currently processing. Skip it, we'll get
@@ -199,16 +194,16 @@ void LoopVersioningPass::runOnOperation() {
     mlir::Value allCompares = nullptr;
     // Ensure all of the arrays are unit-stride.
     for (auto &arg : op.argsAndDims) {
-
-      fir::SequenceType seqTy = getAsSequenceType(arg.arg);
-      unsigned rank = seqTy.getDimension();
-
-      // We only care about lowest order dimension.
-      for (unsigned i = 0; i < rank; i++) {
+      // Fetch the box dims of every dimension of the array. Dimension 0
+      // is needed for the unit-stride check below; the higher dimensions
+      // supply the strides used when linearizing the index.
+      unsigned ndims = arg.rank;
+      for (unsigned i = 0; i < ndims; i++) {
         mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
         arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
                                                      *arg.arg, dimIdx);
       }
+      // We only care about lowest order dimension, here.
       mlir::Value elemSize =
           builder.createIntegerConstant(loc, idxTy, arg.size);
       mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
@@ -245,25 +240,41 @@ void LoopVersioningPass::runOnOperation() {
         // Reduce the multi-dimensioned index to a single index.
         // This is required becase fir arrays do not support multiple dimensions
         // with unknown dimensions at compile time.
+        // We then calculate the multidimensional array like this:
+        // arr(x, y, z) becomes
+        // arr((z * stride(2) + y * stride(1)) / elemSize + x), where stride(n)
+        // is the byte distance between elements in dimension n.
         if (coop->getOperand(0) == *arg.arg &&
             coop->getOperands().size() >= 2) {
           builder.setInsertionPoint(coop);
-          mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0);
-          // Operand(1) = array; Operand(2) = index1; Operand(3) = index2
-          for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) {
+          mlir::Value totalIndex;
+          for (unsigned i = arg.rank - 1; i > 0; i--) {
+            // Operand(0) = array; Operand(1) = index1; Operand(2) = index2
             mlir::Value curIndex =
-                builder.createConvert(loc, idxTy, coop->getOperand(i));
-            // First arg is Operand2, so dims[i-2] is 0-based i-1!
+                builder.createConvert(loc, idxTy, coop->getOperand(i + 1));
+            // Multiply by the stride of this array. Later we'll divide by the
+            // element size.
             mlir::Value scale =
-                builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1));
+                builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
+            curIndex =
+                builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
+            totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
+                                            loc, curIndex, totalIndex)
+                                      : curIndex;
+          }
+          mlir::Value elemSize =
+              builder.createIntegerConstant(loc, idxTy, arg.size);
+          // This is the lowest dimension - which doesn't need scaling
+          mlir::Value finalIndex =
+              builder.createConvert(loc, idxTy, coop->getOperand(1));
+          if (totalIndex) {
             totalIndex = builder.create<mlir::arith::AddIOp>(
-                loc, totalIndex,
-                builder.create<mlir::arith::MulIOp>(loc, scale, curIndex));
+                loc,
+                builder.create<mlir::arith::DivSIOp>(loc, totalIndex, elemSize),
+                finalIndex);
+          } else {
+            totalIndex = finalIndex;
           }
-          totalIndex = builder.create<mlir::arith::AddIOp>(
-              loc, totalIndex,
-              builder.createConvert(loc, idxTy, coop->getOperand(1)));
-
           auto newOp = builder.create<fir::CoordinateOp>(
               loc, builder.getRefType(elementType), caddr,
               mlir::ValueRange{totalIndex});
diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir
index bad02b7a226c5..3c8930ccbde47 100644
--- a/flang/test/Transforms/loop-versioning.fir
+++ b/flang/test/Transforms/loop-versioning.fir
@@ -156,8 +156,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 // CHECK:    %[[CONV:.*]]  = fir.convert %[[Y]] : {{.*}}
 // CHECK:    %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] : {{.*}}
 // CHECK:    fir.do_loop %[[INDEX:.*]] = {{.*}}
-// CHECK:    %[[IND_PLUS_1:.*]] = arith.addi %{{.*}}, %[[INDEX]]
-// CHECK:    %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[IND_PLUS_1]]
+// CHECK:    %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[INDEX]]
 // CHECK:    %[[YINT:.*]] = fir.load %[[YADDR]] : {{.*}}
 // CHECK:    %[[YINDEX:.*]] = fir.convert %[[YINT]]
 // CHECK:    %[[XADDR:.*]] = fir.array_coor %[[X]] [%{{.*}}] %[[YINDEX]]
@@ -269,7 +268,7 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 // CHECK:     %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]]
 // CHECK:     %[[RES:.*]] = fir.do_loop {{.*}} {
 // CHECK:     %[[ADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %{{.*}}
-// CHECK:     %45 = fir.load %[[ADDR]] : !fir.ref<f32>
+// CHECK:     %{{.*}} = fir.load %[[ADDR]] : !fir.ref<f32>
 // CHECK:   }
 // CHECK:   fir.result %[[RES]] : {{.*}}
 // CHECK: } else {
@@ -355,19 +354,22 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 // Only inner loop should be verisoned.
 // CHECK: fir.do_loop
 // CHECK: %[[ZERO:.*]] = arith.constant 0 : index
-// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
+// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
+// CHECK: %[[ONE:.*]] = arith.constant 1 : index
+// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
 // CHECK: %[[SIZE:.*]] = arith.constant 8 : index
-// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]]
+// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
 // CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
 // CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
 // CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
 // CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
 // Check the 2D -> 1D coordinate conversion, should have a multiply and a final add.
 // Some other operations are checked to synch the different parts.
-// CHECK: arith.muli %[[DIMS]]#1, {{.*}}
-// CHECK: %[[OUTER_IDX:.*]] = arith.addi {{.*}}
+// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
+// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
 // CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
-// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_IDX]], %[[INNER_IDX]]
+// CHECK: %[[OUTER_DIV:.*]] = arith.divsi %[[OUTER_IDX]], %[[ITEMSIZE]]
+// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_DIV]], %[[INNER_IDX]]
 // CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C2D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
 // CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
 // CHECK: fir.result %{{.*}}, %{{.*}}
@@ -384,4 +386,136 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 // CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
 // CHECK: return
 
+// -----
+
+//   subroutine sum3d(a, nx, ny, nz)
+//    real*8 :: a(:, :, :)
+//    integer :: nx, ny, nz
+//    real*8 :: sum
+//    integer :: i, j, k
+//    sum = 0
+//    do k=1,nz
+//       do j=1,ny
+//          do i=0,nx
+//             sum = sum + a(i, j, k)
+//          end do
+//       end do
+//    end do
+//  end subroutine sum3d
+
+
+  func.func @sum3d(%arg0: !fir.box<!fir.array<?x?x?xf64>> {fir.bindc_name = "a"}, %arg1: !fir.ref<i32> {fir.bindc_name = "nx"}, %arg2: !fir.ref<i32> {fir.bindc_name = "ny"}, %arg3: !fir.ref<i32> {fir.bindc_name = "nz"}) {
+    %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum3dEi"}
+    %1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmoduleFsum3dEj"}
+    %2 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmoduleFsum3dEk"}
+    %3 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum3dEsum"}
+    %cst = arith.constant 0.000000e+00 : f64
+    fir.store %cst to %3 : !fir.ref<f64>
+    %c1_i32 = arith.constant 1 : i32
+    %4 = fir.convert %c1_i32 : (i32) -> index
+    %5 = fir.load %arg3 : !fir.ref<i32>
+    %6 = fir.convert %5 : (i32) -> index
+    %c1 = arith.constant 1 : index
+    %7 = fir.convert %4 : (index) -> i32
+    %8:2 = fir.do_loop %arg4 = %4 to %6 step %c1 iter_args(%arg5 = %7) -> (index, i32) {
+      fir.store %arg5 to %2 : !fir.ref<i32>
+      %c1_i32_0 = arith.constant 1 : i32
+      %9 = fir.convert %c1_i32_0 : (i32) -> index
+      %10 = fir.load %arg2 : !fir.ref<i32>
+      %11 = fir.convert %10 : (i32) -> index
+      %c1_1 = arith.constant 1 : index
+      %12 = fir.convert %9 : (index) -> i32
+      %13:2 = fir.do_loop %arg6 = %9 to %11 step %c1_1 iter_args(%arg7 = %12) -> (index, i32) {
+        fir.store %arg7 to %1 : !fir.ref<i32>
+        %c0_i32 = arith.constant 0 : i32
+        %18 = fir.convert %c0_i32 : (i32) -> index
+        %19 = fir.load %arg1 : !fir.ref<i32>
+        %20 = fir.convert %19 : (i32) -> index
+        %c1_2 = arith.constant 1 : index
+        %21 = fir.convert %18 : (index) -> i32
+        %22:2 = fir.do_loop %arg8 = %18 to %20 step %c1_2 iter_args(%arg9 = %21) -> (index, i32) {
+          fir.store %arg9 to %0 : !fir.ref<i32>
+          %27 = fir.load %3 : !fir.ref<f64>
+          %28 = fir.load %0 : !fir.ref<i32>
+          %29 = fir.convert %28 : (i32) -> i64
+          %c1_i64 = arith.constant 1 : i64
+          %30 = arith.subi %29, %c1_i64 : i64
+          %31 = fir.load %1 : !fir.ref<i32>
+          %32 = fir.convert %31 : (i32) -> i64
+          %c1_i64_3 = arith.constant 1 : i64
+          %33 = arith.subi %32, %c1_i64_3 : i64
+          %34 = fir.load %2 : !fir.ref<i32>
+          %35 = fir.convert %34 : (i32) -> i64
+          %c1_i64_4 = arith.constant 1 : i64
+          %36 = arith.subi %35, %c1_i64_4 : i64
+          %37 = fir.coordinate_of %arg0, %30, %33, %36 : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
+          %38 = fir.load %37 : !fir.ref<f64>
+          %39 = arith.addf %27, %38 fastmath<contract> : f64
+          fir.store %39 to %3 : !fir.ref<f64>
+          %40 = arith.addi %arg8, %c1_2 : index
+          %41 = fir.convert %c1_2 : (index) -> i32
+          %42 = fir.load %0 : !fir.ref<i32>
+          %43 = arith.addi %42, %41 : i32
+          fir.result %40, %43 : index, i32
+        }
+        fir.store %22#1 to %0 : !fir.ref<i32>
+        %23 = arith.addi %arg6, %c1_1 : index
+        %24 = fir.convert %c1_1 : (index) -> i32
+        %25 = fir.load %1 : !fir.ref<i32>
+        %26 = arith.addi %25, %24 : i32
+        fir.result %23, %26 : index, i32
+      }
+      fir.store %13#1 to %1 : !fir.ref<i32>
+      %14 = arith.addi %arg4, %c1 : index
+      %15 = fir.convert %c1 : (index) -> i32
+      %16 = fir.load %2 : !fir.ref<i32>
+      %17 = arith.addi %16, %15 : i32
+      fir.result %14, %17 : index, i32
+    }
+    fir.store %8#1 to %2 : !fir.ref<i32>
+    return
+  }
+  
+// Note this only checks the expected transformation, not the entire generated code:
+// CHECK-LABEL: func.func @sum3d(
+// CHECK-SAME:                  %[[ARG0:.*]]: !fir.box<!fir.array<?x?x?xf64>> {{.*}})
+// Only inner loop should be versioned.
+// CHECK: fir.do_loop
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : index
+// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}}
+// CHECK: %[[ONE:.*]] = arith.constant 1 : index
+// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}}
+// CHECK: %[[TWO:.*]] = arith.constant 2 : index
+// CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[ARG0]], %[[TWO]] : {{.*}}
+// CHECK: %[[SIZE:.*]] = arith.constant 8 : index
+// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]]
+// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}}
+// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]]
+// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref<!fir.array<?xf64>>
+// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}}
+// Check the 3D -> 1D coordinate conversion, should have two multiplies, an add, a divide and a final add.
+// Some other operations are checked to synch the different parts.
+// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS2]]#2, {{.*}}
+// CHECK: %[[MIDDLE_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}}
+// CHECK: %[[MIDDLE_SUM:.*]] = arith.addi %[[MIDDLE_IDX]], %[[OUTER_IDX]]
+// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index
+// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}}
+// CHECK: %[[MIDDLE_DIV:.*]] = arith.divsi %[[MIDDLE_SUM]], %[[ITEMSIZE]]
+// CHECK: %[[C3D:.*]] = arith.addi %[[MIDDLE_DIV]], %[[INNER_IDX]]
+// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C3D]] : (!fir.ref<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref<f64>
+// CHECK: fir.result %{{.*}}, %{{.*}}
+// CHECK: }
+// CHECK  fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1
+// CHECK: } else {
+// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}}
+// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box<!fir.array<?x?x?xf64>>, i64, i64, i64) -> !fir.ref<f64>
+// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref<f64>
+// CHECK: fir.result %{{.*}}, %{{.*}}
+// CHECK: }
+// CHECK  fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1
+// CHECK: }
+// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}}
+// CHECK: return
+
 } // End module