[flang][fir] Convert `fir.do_loop` with the unordered attribute to `scf.parallel`. #168510

NexMing · 2025-11-18T10:35:44Z

Refines the existing conversion to allow fir.do_loop annotated with unordered to be lowered to scf.parallel, while other loops retain their original lowering.

…cf.parallel`.

llvmbot · 2025-11-18T10:36:15Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: Ming Yan (NexMing)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/168510.diff

2 Files Affected:

(modified) flang/lib/Optimizer/Transforms/FIRToSCF.cpp (+30-17)
(modified) flang/test/Fir/FirToSCF/do-loop.fir (+52-3)

diff --git a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
index 70d6ebbcb039c..662bdbf28a6dc 100644
--- a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
+++ b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
@@ -30,6 +30,7 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
                   mlir::PatternRewriter &rewriter) const override {
     mlir::Location loc = doLoopOp.getLoc();
     bool hasFinalValue = doLoopOp.getFinalValue().has_value();
+    bool isUnordered = doLoopOp.getUnordered().has_value();
 
     // Get loop values from the DoLoopOp
     mlir::Value low = doLoopOp.getLowerBound();
@@ -53,37 +54,49 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
         mlir::arith::DivSIOp::create(rewriter, loc, distance, step);
     auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
     auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
-    auto scfForOp =
-        mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one, iterArgs);
 
+    // Create the scf.for or scf.parallel operation
+    mlir::Operation *scfLoopOp = nullptr;
+    if (isUnordered) {
+      scfLoopOp = mlir::scf::ParallelOp::create(rewriter, loc, {zero},
+                                                {tripCount}, {one}, iterArgs);
+    } else {
+      scfLoopOp = mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one,
+                                           iterArgs);
+    }
+
+    // Move the body of the fir.do_loop to the scf.for or scf.parallel
     auto &loopOps = doLoopOp.getBody()->getOperations();
     auto resultOp =
         mlir::cast<fir::ResultOp>(doLoopOp.getBody()->getTerminator());
     auto results = resultOp.getOperands();
-    mlir::Block *loweredBody = scfForOp.getBody();
+    auto scfLoopLikeOp = mlir::cast<mlir::LoopLikeOpInterface>(scfLoopOp);
+    mlir::Block &scfLoopBody = scfLoopLikeOp.getLoopRegions().front()->front();
 
-    loweredBody->getOperations().splice(loweredBody->begin(), loopOps,
-                                        loopOps.begin(),
-                                        std::prev(loopOps.end()));
+    scfLoopBody.getOperations().splice(scfLoopBody.begin(), loopOps,
+                                       loopOps.begin(),
+                                       std::prev(loopOps.end()));
 
-    rewriter.setInsertionPointToStart(loweredBody);
+    rewriter.setInsertionPointToStart(&scfLoopBody);
     mlir::Value iv = mlir::arith::MulIOp::create(
-        rewriter, loc, scfForOp.getInductionVar(), step);
+        rewriter, loc, scfLoopLikeOp.getSingleInductionVar().value(), step);
     iv = mlir::arith::AddIOp::create(rewriter, loc, low, iv);
 
     if (!results.empty()) {
-      rewriter.setInsertionPointToEnd(loweredBody);
+      rewriter.setInsertionPointToEnd(&scfLoopBody);
       mlir::scf::YieldOp::create(rewriter, resultOp->getLoc(), results);
     }
     doLoopOp.getInductionVar().replaceAllUsesWith(iv);
-    rewriter.replaceAllUsesWith(doLoopOp.getRegionIterArgs(),
-                                hasFinalValue
-                                    ? scfForOp.getRegionIterArgs().drop_front()
-                                    : scfForOp.getRegionIterArgs());
-
-    // Copy all the attributes from the old to new op.
-    scfForOp->setAttrs(doLoopOp->getAttrs());
-    rewriter.replaceOp(doLoopOp, scfForOp);
+    rewriter.replaceAllUsesWith(
+        doLoopOp.getRegionIterArgs(),
+        hasFinalValue ? scfLoopLikeOp.getRegionIterArgs().drop_front()
+                      : scfLoopLikeOp.getRegionIterArgs());
+
+    // Copy loop annotations from the fir.do_loop to scf loop op.
+    if (auto ann = doLoopOp.getLoopAnnotation())
+      scfLoopOp->setAttr("loop_annotation", *ann);
+
+    rewriter.replaceOp(doLoopOp, scfLoopOp);
     return mlir::success();
   }
 };
diff --git a/flang/test/Fir/FirToSCF/do-loop.fir b/flang/test/Fir/FirToSCF/do-loop.fir
index 812497c8d0c74..aa8526febeefc 100644
--- a/flang/test/Fir/FirToSCF/do-loop.fir
+++ b/flang/test/Fir/FirToSCF/do-loop.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt %s --fir-to-scf | FileCheck %s
+// RUN: fir-opt %s --fir-to-scf --split-input-file | FileCheck %s
 
 // CHECK-LABEL:   func.func @simple_loop(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
@@ -31,6 +31,8 @@ func.func @simple_loop(%arg0: !fir.ref<!fir.array<100xi32>>) {
   return
 }
 
+// -----
+
 // CHECK-LABEL:   func.func @loop_with_negtive_step(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 100 : index
@@ -64,6 +66,8 @@ func.func @loop_with_negtive_step(%arg0: !fir.ref<!fir.array<100xi32>>) {
   return
 }
 
+// -----
+
 // CHECK-LABEL:   func.func @loop_with_results(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
 // CHECK-SAME:      %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -102,6 +106,8 @@ func.func @loop_with_results(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir.r
   return
 }
 
+// -----
+
 // CHECK-LABEL:   func.func @loop_with_final_value(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
 // CHECK-SAME:      %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -146,6 +152,44 @@ func.func @loop_with_final_value(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !f
   return
 }
 
+// -----
+
+// CHECK-LABEL:   func.func @loop_with_unordered_attr(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1 : index
+// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 100 : index
+// CHECK:           %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index
+// CHECK:           %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_0]] : index
+// CHECK:           %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_0]] : index
+// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[CONSTANT_4:.*]] = arith.constant 1 : index
+// CHECK:           scf.parallel (%[[VAL_0:.*]]) = (%[[CONSTANT_3]]) to (%[[DIVSI_0]]) step (%[[CONSTANT_4]]) {
+// CHECK:             %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[CONSTANT_0]] : index
+// CHECK:             %[[ADDI_1:.*]] = arith.addi %[[CONSTANT_0]], %[[MULI_0]] : index
+// CHECK:             %[[ARRAY_COOR_0:.*]] = fir.array_coor %[[ARG0]](%[[SHAPE_0]]) %[[ADDI_1]] : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK:             fir.store %[[CONSTANT_2]] to %[[ARRAY_COOR_0]] : !fir.ref<i32>
+// CHECK:             scf.reduce
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+func.func @loop_with_unordered_attr(%arg0: !fir.ref<!fir.array<100xi32>>) {
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+  %c1_i32 = arith.constant 1 : i32
+  fir.do_loop %arg1 = %c1 to %c100 step %c1 unordered {
+    %1 = fir.array_coor %arg0(%0) %arg1 : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+    fir.store %c1_i32 to %1 : !fir.ref<i32>
+  }
+  return
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #llvm.loop_vectorize<disable = false>
+// CHECK: #[[$ATTR_1:.+]] = #llvm.loop_annotation<vectorize = #[[$ATTR_0]]>
 // CHECK-LABEL:   func.func @loop_with_attribute(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
 // CHECK-SAME:      %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -167,16 +211,19 @@ func.func @loop_with_final_value(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !f
 // CHECK:             %[[VAL_15:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
 // CHECK:             %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_14]] : i32
 // CHECK:             fir.store %[[VAL_16]] to %[[VAL_3]] : !fir.ref<i32>
-// CHECK:           } {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, reduceAttrs = [#fir.reduce_attr<add>]}
+// CHECK:           } {loop_annotation = #[[$ATTR_1]]}
 // CHECK:           return
 // CHECK:         }
+
+#loop_vectorize = #llvm.loop_vectorize<disable = false>
+#loop_annotation = #llvm.loop_annotation<vectorize = #loop_vectorize>
 func.func @loop_with_attribute(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir.ref<i32>) {
   %c1 = arith.constant 1 : index
   %c0_i32 = arith.constant 0 : i32
   %c100 = arith.constant 100 : index
   %0 = fir.alloca i32
   %1 = fir.shape %c100 : (index) -> !fir.shape<1>
-  fir.do_loop %arg2 = %c1 to %c100 step %c1 reduce(#fir.reduce_attr<add> -> %0 : !fir.ref<i32>) {
+  fir.do_loop %arg2 = %c1 to %c100 step %c1 attributes {loopAnnotation = #loop_annotation} {
     %2 = fir.array_coor %arg0(%1) %arg2 : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
     %3 = fir.load %2 : !fir.ref<i32>
     %4 = fir.load %0 : !fir.ref<i32>
@@ -187,6 +234,8 @@ func.func @loop_with_attribute(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir
   return
 }
 
+// -----
+
 // CHECK-LABEL:   func.func @nested_loop(
 // CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.array<100x100xi32>>) {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 1 : index

github-actions · 2025-11-18T10:54:31Z

🐧 Linux x64 Test Results

4063 tests passed
202 tests skipped

tblah

I would have thought that this would need some work to decide whether parallelising the loop is going to be profitable, but if it works for your use case that's fine by me (as this pass is not enabled by default).

Executing the loop body in parallel places a stronger requirement on the loop body than merely executing it out of order. A loop executed sequentially but out of order still can't have race conditions and can use the same memory address for each variable in every loop iteration. A loop running in parallel may need thread-private allocations of variables in order to produce correct results, even if there are no memory dependencies between iterations. In addition, side effects might not be safe to run in parallel. Two examples:

integer :: tmp
integer :: i
integer, dimension(32) :: results

! Flang does not currently add the unordered attribute but it would not be incorrect
! because executing this sequentially in any order will produce the same result.
! However, if this were executed in parallel, multiple threads could try to read/write to
! tmp at the same time.
do i = 1,32
  tmp = i * 3
  results(i) = tmp * (tmp + 1)
end do

subroutine printMsg
  print *,"hello"
end subroutine

subroutine printMsg100Times
  integer :: i
  ! Flang again would not currently add the unordered attribute but it would not be 
  ! incorrect to do so. The loop body is invariant between iterations so there is no
  ! observable difference from executing this loop out of order. However, the side effects
  ! of the loop body may not be safe to run concurrently. In this case, if the text output
  ! was flushed after each character, and multiple threads wrote characters at the same
  ! time, the output could be jumbled, e.g. "hehhehloolloo"
  do i=1,100
    call printMsg()
  end do
end subroutine

I'm worried that adding a potentially unsound transformation like this (even if it works most of the time for the loops that flang would actually apply unordered to) could limit the usefulness of this pass.

One solution would be to continue to transform these loops to scf.for by default and add a pass option which enables this transformation to scf.parallel. You can set that option when you create your custom pipeline using this pass.

NexMing · 2025-11-21T05:13:36Z

> One solution would be to continue to transform these loops to scf.for by default and add a pass option which enables this transformation to scf.parallel. You can set that option when you create your custom pipeline using this pass.

Thanks for the explanation. I agree with controlling it via an option.

NexMing · 2025-11-24T03:28:04Z

ping.

tblah

LGTM. Sorry for the delayed response.

flang/include/flang/Optimizer/Transforms/Passes.td

NexMing added 2 commits November 18, 2025 18:15

[flang][fir] Convert fir.do_loop with the unordered attribute to `s…

316714c

…cf.parallel`.

Only keep attributes that are valid for scf loop op.

45a330f

llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Nov 18, 2025

NexMing requested review from clementval, kiranchandramohan, rscottmanley, tblah and vzakhari November 18, 2025 10:36

tblah reviewed Nov 20, 2025

View reviewed changes

Add an option to control this conversion.

1446f96

NexMing requested a review from tblah November 21, 2025 05:17

tblah approved these changes Nov 25, 2025

View reviewed changes

flang/include/flang/Optimizer/Transforms/Passes.td Outdated Show resolved Hide resolved

[NFC] Update comments.

4cb2853

NexMing enabled auto-merge (squash) November 25, 2025 14:36

NexMing merged commit 25c95eb into llvm:main Nov 25, 2025
10 checks passed

NexMing deleted the dev/fir-to-scf branch November 25, 2025 14:43

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][fir] Convert `fir.do_loop` with the unordered attribute to `scf.parallel`. #168510

[flang][fir] Convert `fir.do_loop` with the unordered attribute to `scf.parallel`. #168510

NexMing commented Nov 18, 2025 •

edited

Loading

Uh oh!

llvmbot commented Nov 18, 2025

Uh oh!

github-actions bot commented Nov 18, 2025 •

edited

Loading

Uh oh!

tblah left a comment

Uh oh!

NexMing commented Nov 21, 2025

Uh oh!

NexMing commented Nov 24, 2025

Uh oh!

tblah left a comment

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[flang][fir] Convert fir.do_loop with the unordered attribute to scf.parallel. #168510

[flang][fir] Convert fir.do_loop with the unordered attribute to scf.parallel. #168510

Conversation

NexMing commented Nov 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 18, 2025

Uh oh!

github-actions bot commented Nov 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

Uh oh!

tblah left a comment

Choose a reason for hiding this comment

Uh oh!

NexMing commented Nov 21, 2025

Uh oh!

NexMing commented Nov 24, 2025

Uh oh!

tblah left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[flang][fir] Convert `fir.do_loop` with the unordered attribute to `scf.parallel`. #168510

[flang][fir] Convert `fir.do_loop` with the unordered attribute to `scf.parallel`. #168510

NexMing commented Nov 18, 2025 •

edited

Loading

github-actions bot commented Nov 18, 2025 •

edited

Loading