diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index 745c8686457f4..15b53d3b3aae9 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -110,6 +110,12 @@ getCollapseSizeAndForce(const Fortran::parser::AccClauseList &); /// Checks whether the current insertion point is inside OpenACC loop. bool isInOpenACCLoop(fir::FirOpBuilder &); +/// Checks whether the current insertion point is inside a collapsed acc.loop +/// and there are still collapsed dimensions to consume (i.e., this do-loop +/// should not generate a separate loop because the parent already distributes +/// this dimension). +bool isInsideCollapsedACCLoop(fir::FirOpBuilder &); + /// Checks whether the current insertion point is inside OpenACC compute /// construct. bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 63867fab57678..5bd862c74661b 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2493,6 +2493,18 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::lower::pft::Evaluation &eval = getEval(); bool unstructuredContext = eval.lowerAsUnstructured(); + // If this tightly-nested do-loop is inside a collapsed acc.loop and + // its dimension is already covered by the parent's control variables, + // skip generating any loop — just lower the body. The IV value is + // already available from the parent acc.loop's block argument (stored + // into the private variable by buildACCLoopOp). + if (Fortran::lower::isInsideCollapsedACCLoop(*builder)) { + auto iter = eval.getNestedEvaluations().begin(); + for (auto end = --eval.getNestedEvaluations().end(); iter != end; ++iter) + genFIR(*iter, unstructuredContext); + return; + } + // Loops with induction variables inside OpenACC compute constructs // need special handling to ensure that the IVs are privatized. if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) { diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 2154f38dca568..e181f2afc2290 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -4546,6 +4546,22 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) { return false; } +bool Fortran::lower::isInsideCollapsedACCLoop(fir::FirOpBuilder &builder) { + auto parentLoop = + builder.getBlock()->getParent()->getParentOfType(); + if (!parentLoop) + return false; + unsigned numParentIVs = parentLoop.getBody().getNumArguments(); + if (numParentIVs <= 1) + return false; + + // Check that there are still collapsed dimensions to consume: + // no inner acc.loop has been generated yet. + unsigned innerAccLoops = 0; + parentLoop.getBody().walk([&](mlir::acc::LoopOp) { ++innerAccLoops; }); + return (innerAccLoops + 1 < numParentIVs); +} + bool Fortran::lower::isInsideOpenACCComputeConstruct( fir::FirOpBuilder &builder) { return mlir::isa_and_nonnull( diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 index ca932c1b159ba..006f613dd2109 100644 --- a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 +++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s ! Verify collapse(force:2) sinks prologue (between loops) and epilogue (after inner loop) -! into the acc.loop region body. +! into the acc.loop region body without generating a redundant inner acc.loop. subroutine collapse_force_sink(n, m) integer, intent(in) :: n, m @@ -22,20 +22,18 @@ subroutine collapse_force_sink(n, m) ! CHECK: func.func @_QPcollapse_force_sink( ! CHECK: acc.parallel -! Ensure outer acc.loop is combined(parallel) +! Ensure outer acc.loop is combined(parallel) with 2 IVs and no inner acc.loop ! CHECK: acc.loop combined(parallel) -! Prologue: constant 4.2 and an assign before inner loop +! Prologue: constant 4.2 and an assign before inner loop body ! CHECK: arith.constant 4.200000e+00 ! CHECK: hlfir.assign -! Inner loop and its body include 2.0 add and an assign -! CHECK: acc.loop +! Inner loop body (no acc.loop wrapping it): 2.0 add and an assign ! CHECK: arith.constant 2.000000e+00 ! CHECK: arith.addf ! CHECK: hlfir.assign -! Epilogue: constant 7.3 and an assign after inner loop +! Epilogue: constant 7.3 and an assign after inner loop body ! CHECK: arith.constant 7.300000e+00 ! CHECK: hlfir.assign -! And the outer acc.loop has collapse = [2] +! The acc.loop has collapse = [2] and no inner acc.loop ! CHECK: } attributes {collapse = [2] - - +! CHECK-NOT: acc.loop diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir new file mode 100644 index 0000000000000..5d2f01362f3b0 --- /dev/null +++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-collapse-reduction.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s + +// Test that a collapsed acc.loop with 2 IVs converts correctly to +// scf.parallel with 2 dimensions and no redundant inner loops. + +// CHECK-LABEL: func.func @collapse_two_ivs +// CHECK: acc.compute_region +// CHECK: scf.parallel +// CHECK-NOT: scf.for +// CHECK: scf.reduce +func.func @collapse_two_ivs(%buf: memref<1xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + %c200 = arith.constant 200 : index + + %dev = acc.copyin varPtr(%buf : memref<1xf32>) -> memref<1xf32> + acc.parallel dataOperands(%dev : memref<1xf32>) { + acc.loop combined(parallel) control(%i : index, %j : index) = (%c1, %c1 : index, index) to (%c100, %c200 : index, index) step (%c1, %c1 : index, index) { + %vi = arith.index_cast %i : index to i32 + %vj = arith.index_cast %j : index to i32 + %sum = arith.addi %vi, %vj : i32 + %valf = arith.sitofp %sum : i32 to f32 + memref.store %valf, %dev[%c0] : memref<1xf32> + acc.yield + } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array, independent = [#acc.device_type]} + acc.yield + } + acc.copyout accPtr(%dev : memref<1xf32>) to varPtr(%buf : memref<1xf32>) + return +}