-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[flang][fir] Convert fir.do_loop with the unordered attribute to scf.parallel.
#168510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-flang-fir-hlfir Author: Ming Yan (NexMing) ChangesFull diff: https://github.com/llvm/llvm-project/pull/168510.diff 2 Files Affected:
diff --git a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
index 70d6ebbcb039c..662bdbf28a6dc 100644
--- a/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
+++ b/flang/lib/Optimizer/Transforms/FIRToSCF.cpp
@@ -30,6 +30,7 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
mlir::PatternRewriter &rewriter) const override {
mlir::Location loc = doLoopOp.getLoc();
bool hasFinalValue = doLoopOp.getFinalValue().has_value();
+ bool isUnordered = doLoopOp.getUnordered().has_value();
// Get loop values from the DoLoopOp
mlir::Value low = doLoopOp.getLowerBound();
@@ -53,37 +54,49 @@ struct DoLoopConversion : public mlir::OpRewritePattern<fir::DoLoopOp> {
mlir::arith::DivSIOp::create(rewriter, loc, distance, step);
auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
- auto scfForOp =
- mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one, iterArgs);
+ // Create the scf.for or scf.parallel operation
+ mlir::Operation *scfLoopOp = nullptr;
+ if (isUnordered) {
+ scfLoopOp = mlir::scf::ParallelOp::create(rewriter, loc, {zero},
+ {tripCount}, {one}, iterArgs);
+ } else {
+ scfLoopOp = mlir::scf::ForOp::create(rewriter, loc, zero, tripCount, one,
+ iterArgs);
+ }
+
+ // Move the body of the fir.do_loop to the scf.for or scf.parallel
auto &loopOps = doLoopOp.getBody()->getOperations();
auto resultOp =
mlir::cast<fir::ResultOp>(doLoopOp.getBody()->getTerminator());
auto results = resultOp.getOperands();
- mlir::Block *loweredBody = scfForOp.getBody();
+ auto scfLoopLikeOp = mlir::cast<mlir::LoopLikeOpInterface>(scfLoopOp);
+ mlir::Block &scfLoopBody = scfLoopLikeOp.getLoopRegions().front()->front();
- loweredBody->getOperations().splice(loweredBody->begin(), loopOps,
- loopOps.begin(),
- std::prev(loopOps.end()));
+ scfLoopBody.getOperations().splice(scfLoopBody.begin(), loopOps,
+ loopOps.begin(),
+ std::prev(loopOps.end()));
- rewriter.setInsertionPointToStart(loweredBody);
+ rewriter.setInsertionPointToStart(&scfLoopBody);
mlir::Value iv = mlir::arith::MulIOp::create(
- rewriter, loc, scfForOp.getInductionVar(), step);
+ rewriter, loc, scfLoopLikeOp.getSingleInductionVar().value(), step);
iv = mlir::arith::AddIOp::create(rewriter, loc, low, iv);
if (!results.empty()) {
- rewriter.setInsertionPointToEnd(loweredBody);
+ rewriter.setInsertionPointToEnd(&scfLoopBody);
mlir::scf::YieldOp::create(rewriter, resultOp->getLoc(), results);
}
doLoopOp.getInductionVar().replaceAllUsesWith(iv);
- rewriter.replaceAllUsesWith(doLoopOp.getRegionIterArgs(),
- hasFinalValue
- ? scfForOp.getRegionIterArgs().drop_front()
- : scfForOp.getRegionIterArgs());
-
- // Copy all the attributes from the old to new op.
- scfForOp->setAttrs(doLoopOp->getAttrs());
- rewriter.replaceOp(doLoopOp, scfForOp);
+ rewriter.replaceAllUsesWith(
+ doLoopOp.getRegionIterArgs(),
+ hasFinalValue ? scfLoopLikeOp.getRegionIterArgs().drop_front()
+ : scfLoopLikeOp.getRegionIterArgs());
+
+ // Copy loop annotations from the fir.do_loop to scf loop op.
+ if (auto ann = doLoopOp.getLoopAnnotation())
+ scfLoopOp->setAttr("loop_annotation", *ann);
+
+ rewriter.replaceOp(doLoopOp, scfLoopOp);
return mlir::success();
}
};
diff --git a/flang/test/Fir/FirToSCF/do-loop.fir b/flang/test/Fir/FirToSCF/do-loop.fir
index 812497c8d0c74..aa8526febeefc 100644
--- a/flang/test/Fir/FirToSCF/do-loop.fir
+++ b/flang/test/Fir/FirToSCF/do-loop.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt %s --fir-to-scf | FileCheck %s
+// RUN: fir-opt %s --fir-to-scf --split-input-file | FileCheck %s
// CHECK-LABEL: func.func @simple_loop(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
@@ -31,6 +31,8 @@ func.func @simple_loop(%arg0: !fir.ref<!fir.array<100xi32>>) {
return
}
+// -----
+
// CHECK-LABEL: func.func @loop_with_negtive_step(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
// CHECK: %[[VAL_0:.*]] = arith.constant 100 : index
@@ -64,6 +66,8 @@ func.func @loop_with_negtive_step(%arg0: !fir.ref<!fir.array<100xi32>>) {
return
}
+// -----
+
// CHECK-LABEL: func.func @loop_with_results(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -102,6 +106,8 @@ func.func @loop_with_results(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir.r
return
}
+// -----
+
// CHECK-LABEL: func.func @loop_with_final_value(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -146,6 +152,44 @@ func.func @loop_with_final_value(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !f
return
}
+// -----
+
+// CHECK-LABEL: func.func @loop_with_unordered_attr(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>) {
+// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+// CHECK: %[[CONSTANT_1:.*]] = arith.constant 100 : index
+// CHECK: %[[SHAPE_0:.*]] = fir.shape %[[CONSTANT_1]] : (index) -> !fir.shape<1>
+// CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_1]], %[[CONSTANT_0]] : index
+// CHECK: %[[ADDI_0:.*]] = arith.addi %[[SUBI_0]], %[[CONSTANT_0]] : index
+// CHECK: %[[DIVSI_0:.*]] = arith.divsi %[[ADDI_0]], %[[CONSTANT_0]] : index
+// CHECK: %[[CONSTANT_3:.*]] = arith.constant 0 : index
+// CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+// CHECK: scf.parallel (%[[VAL_0:.*]]) = (%[[CONSTANT_3]]) to (%[[DIVSI_0]]) step (%[[CONSTANT_4]]) {
+// CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[CONSTANT_0]] : index
+// CHECK: %[[ADDI_1:.*]] = arith.addi %[[CONSTANT_0]], %[[MULI_0]] : index
+// CHECK: %[[ARRAY_COOR_0:.*]] = fir.array_coor %[[ARG0]](%[[SHAPE_0]]) %[[ADDI_1]] : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK: fir.store %[[CONSTANT_2]] to %[[ARRAY_COOR_0]] : !fir.ref<i32>
+// CHECK: scf.reduce
+// CHECK: }
+// CHECK: return
+// CHECK: }
+func.func @loop_with_unordered_attr(%arg0: !fir.ref<!fir.array<100xi32>>) {
+ %c1 = arith.constant 1 : index
+ %c100 = arith.constant 100 : index
+ %0 = fir.shape %c100 : (index) -> !fir.shape<1>
+ %c1_i32 = arith.constant 1 : i32
+ fir.do_loop %arg1 = %c1 to %c100 step %c1 unordered {
+ %1 = fir.array_coor %arg0(%0) %arg1 : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ fir.store %c1_i32 to %1 : !fir.ref<i32>
+ }
+ return
+}
+
+// -----
+
+// CHECK: #[[$ATTR_0:.+]] = #llvm.loop_vectorize<disable = false>
+// CHECK: #[[$ATTR_1:.+]] = #llvm.loop_annotation<vectorize = #[[$ATTR_0]]>
// CHECK-LABEL: func.func @loop_with_attribute(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100xi32>>,
// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32>) {
@@ -167,16 +211,19 @@ func.func @loop_with_final_value(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !f
// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_14]] : i32
// CHECK: fir.store %[[VAL_16]] to %[[VAL_3]] : !fir.ref<i32>
-// CHECK: } {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, reduceAttrs = [#fir.reduce_attr<add>]}
+// CHECK: } {loop_annotation = #[[$ATTR_1]]}
// CHECK: return
// CHECK: }
+
+#loop_vectorize = #llvm.loop_vectorize<disable = false>
+#loop_annotation = #llvm.loop_annotation<vectorize = #loop_vectorize>
func.func @loop_with_attribute(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir.ref<i32>) {
%c1 = arith.constant 1 : index
%c0_i32 = arith.constant 0 : i32
%c100 = arith.constant 100 : index
%0 = fir.alloca i32
%1 = fir.shape %c100 : (index) -> !fir.shape<1>
- fir.do_loop %arg2 = %c1 to %c100 step %c1 reduce(#fir.reduce_attr<add> -> %0 : !fir.ref<i32>) {
+ fir.do_loop %arg2 = %c1 to %c100 step %c1 attributes {loopAnnotation = #loop_annotation} {
%2 = fir.array_coor %arg0(%1) %arg2 : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
%3 = fir.load %2 : !fir.ref<i32>
%4 = fir.load %0 : !fir.ref<i32>
@@ -187,6 +234,8 @@ func.func @loop_with_attribute(%arg0: !fir.ref<!fir.array<100xi32>>, %arg1: !fir
return
}
+// -----
+
// CHECK-LABEL: func.func @nested_loop(
// CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<100x100xi32>>) {
// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
|
🐧 Linux x64 Test Results
|
tblah
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would have thought that this would need some work to decide whether parallelising the loop is going to be profitable, but if it works for your use case that's fine by me (as this pass is not enabled by default).
Executing the loop body in parallel is a stronger requirement on the loop body than merely out of order. A loop executed sequentially but out of order still can't have race conditions and can use the same memory address for each variable for each loop iteration. A loop running in parallel may need thread private allocations of variables in order to produce correct results, even if there are no memory dependencies between each iteration. Plus side effects might not be safe to run in parallel. Two examples:
integer :: tmp
integer :: i
integer, dimension(32) :: results
! Flang does not currently add the unordered attribute but it would not be incorrect
! because executing this sequentially in any order will produce the same result.
! However, if this were executed in parallel, multiple threads could try to read/write to
! tmp at the same time.
do i = 1,32
tmp = i * 3
results(i) = tmp * (tmp + 1)
end do
subroutine printMsg
print *,"hello"
end subroutine
subroutine printMsg100Times
integer :: i
! Flang again would not currently add the unordered attribute but it would not be
! incorrect to do so. The loop body is invariant between iterations so there is no
! observable difference from executing this loop out of order. However, the side effects
! of the loop body may not be safe to run concurrently. In this case, if the text output
! was flushed after each character, and multiple threads wrote characters at the same
! time, the output could be jumpbled e.g. "hehhehloolloo"
do i=1,100
call printMsg()
end do
end subroutine
I'm worried that adding a potentially unsound transformation like this (even if it works most of the time for the loops that flang would actually apply unordered to) could limit the usefulness of this pass.
One solution would be to continue to transform these loops to scf.for by default and add a pass option which enables this transformation to scf.parallel. You can set that option when you create your custom pipeline using this pass.
Thanks for the explanation. I agree with controlling it via an option. |
|
ping. |
tblah
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Sorry for the delayed response.
Refines the existing conversion to allow
fir.do_loopannotated withunorderedto be lowered toscf.parallel, while other loops retain their original lowering.