diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index c82eae154d31a..4aee66f83b493 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -75,7 +75,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
     func.func @foo(%arg0: !fir.ref>>, %arg1: !fir.ref) {
       %c10 = arith.constant 10 : index
       %c20 = arith.constant 20 : index
-      %1 = fir.load %ag1 : fir.ref
+      %1 = fir.load %arg1 : fir.ref
       %2 = fir.shape_shift %c10, %1, %c20, %1 : (index, index, index, index) -> !fir.shapeshift<2>
       %3 = hfir.declare %arg0(%2) typeparams %1 {uniq_name = "c"} (fir.ref>>, fir.shapeshift<2>, index) -> (fir.box>>, fir.ref>>)
       // ... uses %3#0 as "c"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index e1d22c8c986da..25a526ab0cbfc 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
                        bool noNaNsFPMath, bool approxFuncFPMath,
                        bool noSignedZerosFPMath, bool unsafeFPMath);
 
+std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass();
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 5fb576fd87625..06de4a1d28a92 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
   let constructor = "::fir::createFunctionAttrPass()";
 }
 
+def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> {
+  let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";
+
+  let description = [{ This is an experimental pass to map `DO CONCURRENT` loops
+     to their corresponding OpenMP worksharing constructs.
+
+     For now the following is supported:
+       - Mapping simple loops to `parallel do`.
+
+     Still TODO:
+       - More extensive testing.
+       - Mapping to `target teams distribute parallel do`.
+       - Allowing the user to control mapping behavior: either to the host or
+         target.
+  }];
+
+  let constructor = "::fir::createDoConcurrentConversionPass()";
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index ba2e267996150..cf83bb496bb5e 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FIRTransforms
   OMPMarkDeclareTarget.cpp
   VScaleAttr.cpp
   FunctionAttr.cpp
+  DoConcurrentConversion.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
new file mode 100644
index 0000000000000..e7b223aec8ea2
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
@@ -0,0 +1,205 @@
+//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+#include <memory>
+
+namespace fir {
+#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
+
+namespace {
+class DoConcurrentConversion
+    : public mlir::OpConversionPattern<fir::DoLoopOp> {
+public:
+  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::OpPrintingFlags flags;
+    flags.printGenericOpForm();
+
+    mlir::omp::ParallelOp parallelOp =
+        rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());
+
+    mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion());
+
+    rewriter.setInsertionPointToEnd(block);
+    rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());
+
+    rewriter.setInsertionPointToStart(block);
+
+    // ==== TODO (1) Start ====
+    //
+    // The goal of the few lines below is to collect and clone the list of
+    // operations that define the loop's lower and upper bounds as well as the
+    // step. Should we, instead of doing this here, split it into 2 stages?
+    //
+    // 1. **Stage 1**: add an analysis that extracts all the relevant
+    //                 operations defining the lower-bound, upper-bound, and
+    //                 step.
+    // 2. **Stage 2**: clone the collected operations in the parallel region.
+    //
+    // So far, the pass has been tested with very simple loops (where the
+    // bounds and step are constants), so the goal of **Stage 1** is to have a
+    // well-defined component whose sole responsibility is collecting all the
+    // ops relevant to the loop header. This way we can test it in isolation
+    // for more complex loops and better organize the code. **Stage 2** would
+    // then be responsible for the actual cloning of the collected loop header
+    // preparation/allocation operations.
+
+    // Clone the LB, UB, step defining ops inside the parallel region.
+    llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
+    lowerBound.push_back(
+        rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0));
+    upperBound.push_back(
+        rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0));
+    step.push_back(
+        rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0));
+    // ==== TODO (1) End ====
+
+    auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
+        doLoop.getLoc(), lowerBound, upperBound, step);
+    wsLoopOp.setInclusive(true);
+
+    auto outlineableOp =
+        mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
+    rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());
+
+    // ==== TODO (2) Start ====
+    //
+    // The goal of the simple work-list algorithm below and the `for` loop
+    // that follows it is to collect all the operations related to the
+    // allocation of the induction variable for the `do concurrent` loop.
+    // The operations collected by this algorithm are very similar to what is
+    // usually emitted for privatized variables, e.g. for omp.parallel loops.
+    // Therefore, I think we can:
+    //
+    // 1. **Stage 1**: Add an analysis that collects all these operations. The
+    //                 goal is similar to **Stage 1** of TODO (1): isolate the
+    //                 algorithm as an individually-testable component so that
+    //                 we properly implement and test it for more complicated
+    //                 `do concurrent` loops.
+    // 2. **Stage 2**: Using the collected operations, create and populate an
+    //                 `omp.private {type=private}` op to serve as the
+    //                 delayed privatizer for the new work-sharing loop.
+
+    // For the induction variable, we need to privatize its allocation and
+    // binding inside the parallel region.
+    llvm::SmallSetVector workList;
+    // Therefore, we first identify the induction variable by finding the
+    // `fir.store`s whose source is the loop's block argument.
+    workList.insert(doLoop.getInductionVar().getUsers().begin(),
+                    doLoop.getInductionVar().getUsers().end());
+    llvm::SmallSetVector inductionVarTargetStores;
+
+    // Walk the use-chain of the loop's block argument until we hit a
+    // `fir.store`.
+    while (!workList.empty()) {
+      mlir::Operation *item = workList.front();
+
+      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item)) {
+        inductionVarTargetStores.insert(storeOp);
+      } else {
+        workList.insert(item->getUsers().begin(), item->getUsers().end());
+      }
+
+      workList.remove(item);
+    }
+
+    // For each collected `fir.store`, find the target memref's alloca and
+    // declare ops.
+    llvm::SmallSetVector declareAndAllocasToClone;
+    for (auto storeOp : inductionVarTargetStores) {
+      mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
+
+      for (auto operand : storeTarget->getOperands()) {
+        declareAndAllocasToClone.insert(operand.getDefiningOp());
+      }
+      declareAndAllocasToClone.insert(storeTarget);
+    }
+    // ==== TODO (2) End ====
+    //
+    // TODO (1 & 2): By isolating the analyses proposed in both TODOs, I think
+    // we can more easily generalize the pass to work for targets other than
+    // OpenMP, e.g. OpenACC: we can reuse the results of the analyses and only
+    // change the code-gen/rewriting.
+
+    mlir::IRMapping mapper;
+
+    // Clone the collected memref-defining ops inside the parallel region.
+    for (mlir::Operation *opToClone : declareAndAllocasToClone) {
+      rewriter.clone(*opToClone, mapper);
+    }
+
+    // Clone the loop's body inside the worksharing construct using the mapped
+    // memref values.
+    rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
+                               wsLoopOp.getRegion().begin(), mapper);
+
+    mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator();
+    rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back());
+    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
+    rewriter.eraseOp(terminator);
+
+    rewriter.eraseOp(doLoop);
+
+    return mlir::success();
+  }
+};
+
+class DoConcurrentConversionPass
+    : public fir::impl::DoConcurrentConversionPassBase<
+          DoConcurrentConversionPass> {
+public:
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+
+    if (func.isDeclaration()) {
+      return;
+    }
+
+    auto *context = &getContext();
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<DoConcurrentConversion>(context);
+    mlir::ConversionTarget target(*context);
+    target.addLegalDialect();
+
+    target.addDynamicallyLegalOp<fir::DoLoopOp>(
+        [](fir::DoLoopOp op) { return !op.getUnordered(); });
+
+    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
+                                               std::move(patterns)))) {
+      mlir::emitError(mlir::UnknownLoc::get(context),
+                      "error in converting do-concurrent op");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() {
+  return std::make_unique<DoConcurrentConversionPass>();
+}
diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic.mlir
new file mode 100644
index 0000000000000..7d62463f36d42
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic.mlir
@@ -0,0 +1,60 @@
+// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s
+
+// CHECK-LABEL: func.func @do_concurrent_basic
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+  // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+  // CHECK: %[[C1:.*]] = arith.constant 1 : i32
+  // CHECK: %[[C10:.*]] = arith.constant 10 : i32
+
+  %0 = fir.alloca i32 {bindc_name = "i"}
+  %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+  %c10 = arith.constant 10 : index
+  %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+  %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+  %c1_i32 = arith.constant 1 : i32
+  %7 = fir.convert %c1_i32 : (i32) -> index
+  %c10_i32 = arith.constant 10 : i32
+  %8 = fir.convert %c10_i32 : (i32) -> index
+  %c1 = arith.constant 1 : index
+
+  // CHECK-NOT: fir.do_loop
+
+  // CHECK: omp.parallel {
+
+  // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+  // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+  // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+  // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+  // CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+  // CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+  // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+  // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+  // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+  // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+  // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+  // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+  // CHECK-NEXT: omp.yield
+  // CHECK-NEXT: }
+
+  // CHECK-NEXT: omp.terminator
+  // CHECK-NEXT: }
+  fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+    %13 = fir.convert %arg0 : (index) -> i32
+    fir.store %13 to %1#1 : !fir.ref<i32>
+    %14 = fir.load %1#0 : !fir.ref<i32>
+    %15 = fir.load %1#0 : !fir.ref<i32>
+    %16 = fir.convert %15 : (i32) -> i64
+    %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+    hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+  }
+
+  // CHECK-NOT: fir.do_loop
+
+  return
+}
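
Usage note (not part of the patch): besides driving the pass through `fir-opt` as in the RUN line above, it can also be scheduled programmatically. The sketch below is illustrative only; the helper name `addDoConcurrentConversion` is made up, and it assumes a standard MLIR pass-manager setup.

    #include "flang/Optimizer/Transforms/Passes.h"
    #include "mlir/Dialect/Func/IR/FuncOps.h"
    #include "mlir/Pass/PassManager.h"

    // Schedule the experimental `do concurrent` -> OpenMP conversion on every
    // function in the module. The pass is declared on mlir::func::FuncOp in
    // Passes.td, so it must be nested under func ops.
    static void addDoConcurrentConversion(mlir::PassManager &pm) {
      pm.addNestedPass<mlir::func::FuncOp>(
          fir::createDoConcurrentConversionPass());
    }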