From b24a7a72598d303a2e3a2ca4f61a7b1a0a744fa4 Mon Sep 17 00:00:00 2001 From: Ferran Toda Date: Thu, 20 Nov 2025 02:45:10 +0000 Subject: [PATCH] lower loop fuse --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 1 + flang/lib/Lower/OpenMP/Clauses.cpp | 5 +- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 3 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 77 ++++++++-- flang/lib/Lower/OpenMP/Utils.cpp | 28 ++-- flang/lib/Lower/OpenMP/Utils.h | 6 +- flang/test/Lower/OpenMP/fuse01.f90 | 93 ++++++++++++ flang/test/Lower/OpenMP/fuse02.f90 | 123 +++++++++++++++ .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 53 +++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 111 ++++++++++++++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 34 +++++ mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 68 +++++++++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 54 +++++++ mlir/test/Dialect/OpenMP/cli-fuse.mlir | 114 ++++++++++++++ mlir/test/Dialect/OpenMP/invalid-fuse.mlir | 100 +++++++++++++ .../test/Target/LLVMIR/openmp-cli-fuse01.mlir | 100 +++++++++++++ .../test/Target/LLVMIR/openmp-cli-fuse02.mlir | 140 ++++++++++++++++++ .../test/transform/fuse/do-looprange.f90 | 60 ++++++++ openmp/runtime/test/transform/fuse/do.f90 | 52 +++++++ 19 files changed, 1194 insertions(+), 28 deletions(-) create mode 100644 flang/test/Lower/OpenMP/fuse01.f90 create mode 100644 flang/test/Lower/OpenMP/fuse02.f90 create mode 100644 mlir/test/Dialect/OpenMP/cli-fuse.mlir create mode 100644 mlir/test/Dialect/OpenMP/invalid-fuse.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir create mode 100644 openmp/runtime/test/transform/fuse/do-looprange.f90 create mode 100644 openmp/runtime/test/transform/fuse/do.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 4a392381287d5..ab3a174c7ad69 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ 
-279,6 +279,7 @@ bool ClauseProcessor::processCollapse( llvm::SmallVectorImpl &iv) const { int64_t numCollapse = collectLoopRelatedInfo(converter, currentLocation, eval, + eval.getFirstNestedEvaluation(), clauses, loopResult, iv); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); collapseResult.collapseNumLoops = firOpBuilder.getI64IntegerAttr(numCollapse); diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index b1a3c3d3c5439..f2defc62dce91 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -1063,7 +1063,10 @@ Link make(const parser::OmpClause::Link &inp, LoopRange make(const parser::OmpClause::Looprange &inp, semantics::SemanticsContext &semaCtx) { - llvm_unreachable("Unimplemented: looprange"); + auto &t0 = std::get<0>(inp.v.t); + auto &t1 = std::get<1>(inp.v.t); + return LoopRange{{/*First*/ makeExpr(t0, semaCtx), + /*Count*/ makeExpr(t1, semaCtx)}}; } Map make(const parser::OmpClause::Map &inp, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 83c2eda0a2dc7..da9480123513f 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -347,7 +347,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { mlir::omp::LoopRelatedClauseOps result; llvm::SmallVector iv; collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval, - clauses, result, iv); + eval.getFirstNestedEvaluation(), clauses, result, + iv); // Update the original variable just before exiting the worksharing // loop. 
Conversion as follows: diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b11a1a14db066..5a31443f4eeee 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1982,9 +1982,9 @@ genLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, static void genCanonicalLoopNest( lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::const_iterator item, size_t numLoops, - llvm::SmallVectorImpl &loops) { + lower::pft::Evaluation &nestedEval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item, + size_t numLoops, llvm::SmallVectorImpl &loops) { assert(loops.empty() && "Expecting empty list to fill"); assert(numLoops >= 1 && "Expecting at least one loop"); @@ -1992,7 +1992,8 @@ static void genCanonicalLoopNest( mlir::omp::LoopRelatedClauseOps loopInfo; llvm::SmallVector ivs; - collectLoopRelatedInfo(converter, loc, eval, numLoops, loopInfo, ivs); + collectLoopRelatedInfo(converter, loc, eval, nestedEval, numLoops, loopInfo, + ivs); assert(ivs.size() == numLoops && "Expected to parse as many loop variables as there are loops"); @@ -2014,7 +2015,7 @@ static void genCanonicalLoopNest( // Step 1: Loop prologues // Computing the trip count must happen before entering the outermost loop - lower::pft::Evaluation *innermostEval = &eval.getFirstNestedEvaluation(); + lower::pft::Evaluation *innermostEval = &nestedEval; for ([[maybe_unused]] auto iv : ivs) { if (innermostEval->getIf()->IsDoConcurrent()) { // OpenMP specifies DO CONCURRENT only with the `!omp loop` construct. 
@@ -2186,7 +2187,8 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector canonLoops; canonLoops.reserve(numLoops); - genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item, + genCanonicalLoopNest(converter, symTable, semaCtx, eval, + eval.getFirstNestedEvaluation(), loc, queue, item, numLoops, canonLoops); assert((canonLoops.size() == numLoops) && "Expecting the predetermined number of loops"); @@ -2217,6 +2219,58 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter, sizesClause.sizes); } +static void genFuseOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + lower::StatementContext &stmtCtx, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, + ConstructQueue::const_iterator item) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + int32_t first = 0; + int32_t count = 0; + auto iter = llvm::find_if(item->clauses, [](const Clause &clause) { + return clause.id == llvm::omp::Clause::OMPC_looprange; + }); + if (iter != item->clauses.end()) { + const auto &looprange = std::get(iter->u); + first = evaluate::ToInt64(std::get<0>(looprange.t)).value(); + count = evaluate::ToInt64(std::get<1>(looprange.t)).value(); + } + + llvm::SmallVector applyees; + for (auto &child : eval.getNestedEvaluations()) { + // Skip OmpEndLoopDirective + if (&child == &eval.getLastNestedEvaluation()) + break; + + // Emit the associated loop + llvm::SmallVector canonLoops; + genCanonicalLoopNest(converter, symTable, semaCtx, eval, child, loc, queue, + item, 1, canonLoops); + + auto cli = llvm::getSingleElement(canonLoops).getCli(); + applyees.push_back(cli); + } + // One generated loop + one for each loop not inside the specified looprange + // if present + llvm::SmallVector generatees; + int64_t numGeneratees = count == 0 ? 
1 : applyees.size() - count + 1; + for (int i = 0; i < numGeneratees; i++) { + auto fusedCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc); + generatees.push_back(fusedCLI); + } + auto op = mlir::omp::FuseOp::create(firOpBuilder, loc, generatees, applyees); + + if (count != 0) { + mlir::IntegerAttr firstAttr = firOpBuilder.getI32IntegerAttr(first); + mlir::IntegerAttr countAttr = firOpBuilder.getI32IntegerAttr(count); + op->setAttr("first", firstAttr); + op->setAttr("count", countAttr); + } +} + static void genUnrollOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, lower::StatementContext &stmtCtx, @@ -2233,7 +2287,8 @@ static void genUnrollOp(Fortran::lower::AbstractConverter &converter, // Emit the associated loop llvm::SmallVector canonLoops; - genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item, 1, + genCanonicalLoopNest(converter, symTable, semaCtx, eval, + eval.getFirstNestedEvaluation(), loc, queue, item, 1, canonLoops); llvm::SmallVector applyees; @@ -3507,13 +3562,9 @@ static void genOMPDispatch(lower::AbstractConverter &converter, case llvm::omp::Directive::OMPD_tile: genTileOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; - case llvm::omp::Directive::OMPD_fuse: { - unsigned version = semaCtx.langOptions().OpenMPVersion; - if (!semaCtx.langOptions().OpenMPSimd) - TODO(loc, "Unhandled loop directive (" + - llvm::omp::getOpenMPDirectiveName(dir, version) + ")"); + case llvm::omp::Directive::OMPD_fuse: + genFuseOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; - } case llvm::omp::Directive::OMPD_unroll: genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); break; diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 7d7a4869ab3a6..913e4d1e69500 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -812,13 +812,14 @@ void collectTileSizesFromOpenMPConstruct( int64_t 
collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, const omp::List &clauses, + lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval, + const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) { int64_t numCollapse = 1; // Collect the loops to collapse. - lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation(); + lower::pft::Evaluation *doConstructEval = &nestedEval; if (doConstructEval->getIf()->IsDoConcurrent()) { TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); } @@ -830,21 +831,21 @@ int64_t collectLoopRelatedInfo( numCollapse = collapseValue; } - collectLoopRelatedInfo(converter, currentLocation, eval, numCollapse, result, - iv); + collectLoopRelatedInfo(converter, currentLocation, eval, nestedEval, + numCollapse, result, iv); return numCollapse; } void collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, int64_t numCollapse, - mlir::omp::LoopRelatedClauseOps &result, + lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval, + int64_t numCollapse, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // Collect the loops to collapse. - lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation(); + lower::pft::Evaluation *doConstructEval = &nestedEval; if (doConstructEval->getIf()->IsDoConcurrent()) { TODO(currentLocation, "Do Concurrent in Worksharing loop construct"); } @@ -852,10 +853,15 @@ void collectLoopRelatedInfo( // Collect sizes from tile directive if present. 
std::int64_t sizesLengthValue = 0l; if (auto *ompCons{eval.getIf()}) { - processTileSizesFromOpenMPConstruct( - ompCons, [&](const parser::OmpClause::Sizes *tclause) { - sizesLengthValue = tclause->v.size(); - }); + if (auto *ompLoop{std::get_if(&ompCons->u)}) { + const parser::OmpDirectiveSpecification &beginSpec{ompLoop->BeginDir()}; + if (beginSpec.DirId() == llvm::omp::Directive::OMPD_tile) { + processTileSizesFromOpenMPConstruct( + ompCons, [&](const parser::OmpClause::Sizes *tclause) { + sizesLengthValue = tclause->v.size(); + }); + } + } } std::int64_t collapseValue = std::max(numCollapse, sizesLengthValue); diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 2960b663b08b2..886a5c1835f7e 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -169,13 +169,15 @@ void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp, int64_t collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, const omp::List &clauses, + lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval, + const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); void collectLoopRelatedInfo( lower::AbstractConverter &converter, mlir::Location currentLocation, - lower::pft::Evaluation &eval, std::int64_t collapseValue, + lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval, + std::int64_t collapseValue, // const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); diff --git a/flang/test/Lower/OpenMP/fuse01.f90 b/flang/test/Lower/OpenMP/fuse01.f90 new file mode 100644 index 0000000000000..1377bf3e9c529 --- /dev/null +++ b/flang/test/Lower/OpenMP/fuse01.f90 @@ -0,0 +1,93 @@ +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s + + +subroutine omp_fuse01(lb1, ub1, inc1, lb2, ub2, inc2) + integer res, i, j + integer lb1, ub1, inc1 + integer lb2, ub2, inc2 + + !$omp fuse + do i = lb1, ub1, inc1 + res = i + end do + do j = lb2, ub2, inc2 + res = j + end do + !$omp end fuse + +end subroutine omp_fuse01 + + +! CHECK-LABEL: func.func @_QPomp_fuse01( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "lb1"}, +! CHECK-SAME: %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "ub1"}, +! CHECK-SAME: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "inc1"}, +! CHECK-SAME: %[[ARG3:.*]]: !fir.ref {fir.bindc_name = "lb2"}, +! CHECK-SAME: %[[ARG4:.*]]: !fir.ref {fir.bindc_name = "ub2"}, +! CHECK-SAME: %[[ARG5:.*]]: !fir.ref {fir.bindc_name = "inc2"}) { +! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse01Ei"} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse01Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse01Ej"} +! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse01Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse01Eres"} +! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse01Eres"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_7]]#0 : !fir.ref +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32 +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32 +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow : i32 +! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32 +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow : i32 +! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32 +! CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32 +! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli +! 
CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) { +! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32 +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_6]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref +! CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32 +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32 +! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32 +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow : i32 +! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32 +! CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow : i32 +! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32 +! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32 +! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) { +! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32 +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32 +! CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref +! 
CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_6]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli +! CHECK: omp.fuse (%[[NEW_CLI_2]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]]) +! CHECK: return +! CHECK: } + diff --git a/flang/test/Lower/OpenMP/fuse02.f90 b/flang/test/Lower/OpenMP/fuse02.f90 new file mode 100644 index 0000000000000..5a0f37827c36a --- /dev/null +++ b/flang/test/Lower/OpenMP/fuse02.f90 @@ -0,0 +1,123 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s + + +subroutine omp_fuse02(lb1, ub1, inc1, lb2, ub2, inc2) + integer res, i, j, k + integer lb1, ub1, inc1 + integer lb2, ub2, inc2 + + !$omp fuse looprange(2,2) + do i = lb1, ub1, inc1 + res = i + end do + do j = lb2, ub2, inc2 + res = j + end do + do k = lb1, ub2, inc1 + res = k + end do + !$omp end fuse + +end subroutine omp_fuse02 + + +! CHECK-LABEL: func.func @_QPomp_fuse02( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "lb1"}, +! CHECK-SAME: %[[ARG1:.*]]: !fir.ref {fir.bindc_name = "ub1"}, +! CHECK-SAME: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "inc1"}, +! CHECK-SAME: %[[ARG3:.*]]: !fir.ref {fir.bindc_name = "lb2"}, +! CHECK-SAME: %[[ARG4:.*]]: !fir.ref {fir.bindc_name = "ub2"}, +! CHECK-SAME: %[[ARG5:.*]]: !fir.ref {fir.bindc_name = "inc2"}) { +! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse02Ei"} +! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse02Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! 
CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse02Ej"} +! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse02Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFomp_fuse02Ek"} +! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse02Ek"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_3:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse02Eres"} +! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ALLOCA_3]] {uniq_name = "_QFomp_fuse02Eres"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub1"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLARE_9:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub2"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref +! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32 +! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32 +! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32 +! 
CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32 +! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32 +! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow : i32 +! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32 +! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow : i32 +! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32 +! CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32 +! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) { +! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32 +! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32 +! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_6]]#0 : !fir.ref +! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref +! CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref +! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32 +! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32 +! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32 +! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32 +! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow : i32 +! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32 +! 
CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow : i32 +! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32 +! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32 +! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli +! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) { +! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32 +! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32 +! CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[LOAD_8:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref +! CHECK: %[[LOAD_9:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref +! CHECK: %[[LOAD_10:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref +! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : i32 +! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : i32 +! CHECK: %[[CMPI_4:.*]] = arith.cmpi slt, %[[LOAD_10]], %[[CONSTANT_4]] : i32 +! CHECK: %[[SUBI_4:.*]] = arith.subi %[[CONSTANT_4]], %[[LOAD_10]] : i32 +! CHECK: %[[SELECT_8:.*]] = arith.select %[[CMPI_4]], %[[SUBI_4]], %[[LOAD_10]] : i32 +! CHECK: %[[SELECT_9:.*]] = arith.select %[[CMPI_4]], %[[LOAD_9]], %[[LOAD_8]] : i32 +! CHECK: %[[SELECT_10:.*]] = arith.select %[[CMPI_4]], %[[LOAD_8]], %[[LOAD_9]] : i32 +! CHECK: %[[SUBI_5:.*]] = arith.subi %[[SELECT_10]], %[[SELECT_9]] overflow : i32 +! CHECK: %[[DIVUI_2:.*]] = arith.divui %[[SUBI_5]], %[[SELECT_8]] : i32 +! CHECK: %[[ADDI_4:.*]] = arith.addi %[[DIVUI_2]], %[[CONSTANT_5]] overflow : i32 +! CHECK: %[[CMPI_5:.*]] = arith.cmpi slt, %[[SELECT_10]], %[[SELECT_9]] : i32 +! CHECK: %[[SELECT_11:.*]] = arith.select %[[CMPI_5]], %[[CONSTANT_4]], %[[ADDI_4]] : i32 +! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli +! 
CHECK: omp.canonical_loop(%[[NEW_CLI_2]]) %[[VAL_2:.*]] : i32 in range(%[[SELECT_11]]) { +! CHECK: %[[MULI_2:.*]] = arith.muli %[[VAL_2]], %[[LOAD_10]] : i32 +! CHECK: %[[ADDI_5:.*]] = arith.addi %[[LOAD_8]], %[[MULI_2]] : i32 +! CHECK: hlfir.assign %[[ADDI_5]] to %[[DECLARE_4]]#0 : i32, !fir.ref +! CHECK: %[[LOAD_11:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref +! CHECK: hlfir.assign %[[LOAD_11]] to %[[DECLARE_7]]#0 : i32, !fir.ref +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[NEW_CLI_3:.*]] = omp.new_cli +! CHECK: %[[NEW_CLI_4:.*]] = omp.new_cli +! CHECK: omp.fuse (%[[NEW_CLI_3]], %[[NEW_CLI_4]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]], %[[NEW_CLI_2]]) {count = 2 : i32, first = 2 : i32} +! CHECK: return +! CHECK: } + diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f864a895a1259..9073aa7afccdd 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1278,6 +1278,59 @@ class OpenMPIRBuilder { tileLoops(DebugLoc DL, ArrayRef Loops, ArrayRef TileSizes); + /// Fuse a sequence of loops. + /// + /// Fuses the loops of \p Loops. + /// The merging of the loops is done in the following structure: + /// + /// Example: + /// \code + /// for (int i = lb0; i < ub0; i += st0) // trip count is calculated as: + /// body(i) // tc0 = (ub0 - lb0 + st0) / st0 + /// for (int j = lb1; j < ub1; j += st1) + /// body(j); + /// + /// ... + /// + /// for (int k = lbk; k < ubk; k += stk) + /// body(k); + /// \endcode + /// + /// After fusing the loops a single loop is left: + /// \code + /// for (fuse.index = 0; fuse.index < max(tc0, tc1, ... tck); ++fuse.index) { + /// if (fuse.index < tc0){ + /// iv0 = lb0 + st0 * fuse.index; + /// original.index0 = iv0 + /// body(0); + /// } + /// if (fuse.index < tc1){ + /// iv1 = lb1 + st1 * fuse.index; + /// original.index1 = iv1 + /// body(1); + /// } + /// + /// ... 
+ /// + /// if (fuse.index < tck){ + /// ivk = lbk + stk * fuse.index; + /// original.indexk = ivk + /// body(k); + /// } + /// } + /// \endcode + /// + /// + /// @param DL Debug location for instructions added by fusion. + /// + /// @param Loops Loops to fuse. The CanonicalLoopInfo objects are + /// invalidated by this method, i.e. should not be used after + /// fusion. + /// + /// \returns A single loop generated by the loop fusion + LLVM_ABI CanonicalLoopInfo *fuseLoops(DebugLoc DL, + ArrayRef Loops); + /// Fully unroll a loop. /// /// Instead of unrolling the loop immediately (and duplicating its body diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5101717526263..d99575bd5f8f2 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5815,6 +5815,117 @@ static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, } } +CanonicalLoopInfo * +OpenMPIRBuilder::fuseLoops(DebugLoc DL, ArrayRef Loops) { + + CanonicalLoopInfo *firstLoop = Loops.front(); + CanonicalLoopInfo *lastLoop = Loops.back(); + Function *F = firstLoop->getPreheader()->getParent(); + + // Loop control blocks that will become orphaned later + SmallVector oldControlBBs; + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(oldControlBBs); + + // Collect original trip counts + SmallVector origTripCounts; + for (CanonicalLoopInfo *L : Loops) { + assert(L->isValid() && "All input loops must be valid canonical loops"); + origTripCounts.push_back(L->getTripCount()); + } + + Builder.SetCurrentDebugLocation(DL); + + // Compute max trip count. 
+ // The fused loop will be from 0 to max(origTripCounts) + BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc", + F, firstLoop->getHeader()); + Builder.SetInsertPoint(TCBlock); + Value *fusedTripCount = nullptr; + for (CanonicalLoopInfo *L : Loops) { + assert(L->isValid() && "All loops to fuse must be valid canonical loops"); + Value *origTripCount = L->getTripCount(); + if (!fusedTripCount) { + fusedTripCount = origTripCount; + continue; + } + Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount); + fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount, + Twine(".omp.fuse.tc")); + } + + // Generate new loop + CanonicalLoopInfo *fused = + createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(), + lastLoop->getLatch(), "fused"); + + // Replace original loops with the fused loop + // Preheader and After are not considered inside the CLI. + // These are used to compute the individual TCs of the loops + // so they have to be put before the resulting fused loop. + // Moving them up for readability. 
+ for (size_t i = 0; i < Loops.size() - 1; ++i) { + Loops[i]->getPreheader()->moveBefore(TCBlock); + Loops[i]->getAfter()->moveBefore(TCBlock); + } + lastLoop->getPreheader()->moveBefore(TCBlock); + + for (size_t i = 0; i < Loops.size() - 1; ++i) { + redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL); + redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL); + } + redirectTo(lastLoop->getPreheader(), TCBlock, DL); + redirectTo(TCBlock, fused->getPreheader(), DL); + redirectTo(fused->getAfter(), lastLoop->getAfter(), DL); + + // Build the fused body + // Create new Blocks with conditions that jump to the original loop bodies + SmallVector condBBs; + SmallVector condValues; + for (size_t i = 0; i < Loops.size(); ++i) { + BasicBlock *condBlock = BasicBlock::Create( + F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody()); + Builder.SetInsertPoint(condBlock); + Value *condValue = + Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]); + condBBs.push_back(condBlock); + condValues.push_back(condValue); + } + // Join the condition blocks with the bodies of the original loops + redirectTo(fused->getBody(), condBBs[0], DL); + for (size_t i = 0; i < Loops.size() - 1; ++i) { + Builder.SetInsertPoint(condBBs[i]); + Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]); + redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL); + // Replace the IV with the fused IV + Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar()); + } + // Last body jumps to the created end body block + Builder.SetInsertPoint(condBBs.back()); + Builder.CreateCondBr(condValues.back(), lastLoop->getBody(), + fused->getLatch()); + redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL); + // Replace the IV with the fused IV + lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar()); + + // The loop latch must have only one predecessor. 
Currently it is branched to + // from both the last condition block and the last loop body + fused->getLatch()->splitBasicBlock(fused->getLatch()->begin(), + "omp.fused.pre_latch", /*Before=*/true); + + // Remove unused parts + removeUnusedBlocksFromParent(oldControlBBs); + + // Invalidate old CLIs + for (CanonicalLoopInfo *L : Loops) + L->invalidate(); + +#ifndef NDEBUG + fused->assertOK(); +#endif + return fused; +} + void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) { LLVMContext &Ctx = Builder.getContext(); addLoopMetadata( diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..2752c2a806847 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -550,6 +550,40 @@ def TileOp : OpenMPTransformBase_Op<"tile", let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// OpenMP fuse operation +//===----------------------------------------------------------------------===// + +def FuseOp : OpenMPTransformBase_Op<"fuse"> { + let summary = "OpenMP fuse operation"; + let description = [{ + Represents the OpenMP fuse directive introduced in OpenMP 6.0. + + The construct takes a loop sequence and merges the loops specified by the + first and count attributes and generates a loop sequence with the loops + before the first attribute untouched, the generated fused loop, and the loops + after the first + count attribute untouched, maintaining the original + order. If no attributes are specified all the loops in the sequence are + fused generating a single loop. + Each logical iteration of the fused loop executes a logical iteration of + each affected loop. The fused loop has the number of logical iterations + equal to the affected loop with most logical iterations. + + The first and count attributes are constant and known beforehand.
+  }]#clausesDescription;
+
+  let extraClassDeclaration = [{
+    IntegerAttr getFirst() {
+      return this->getOperation()->getAttrOfType<IntegerAttr>("first");
+    }
+    IntegerAttr getCount() {
+      return this->getOperation()->getAttrOfType<IntegerAttr>("count");
+    }
+  }]#clausesExtraClassDeclaration;
+
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // 2.8.3 Workshare Construct
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 1b069c62a8be9..8373a18df281a 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -3429,6 +3429,20 @@ void NewCliOp::getAsmResultNames(OpAsmSetValueNameFn setNameFn) {
           .Case([&](UnrollHeuristicOp op) -> std::string {
             llvm_unreachable("heuristic unrolling does not generate a loop");
           })
+          .Case([&](FuseOp op) -> std::string {
+            unsigned int first = 0;
+            unsigned int count = 0;
+            if (op.getFirst() && op.getCount()) {
+              first = op.getFirst().getInt();
+              count = op.getCount().getInt();
+            }
+            unsigned opnum = generator->getOperandNumber();
+            if ((first != 0 && opnum <= first - 1) ||
+                (count != 0 && opnum >= first + 1))
+              return "canonloop_fuse";
+            else
+              return "fused";
+          })
           .Case([&](TileOp op) -> std::string {
             auto [generateesFirst, generateesCount] =
                 op.getGenerateesODSOperandIndexAndLength();
@@ -3804,6 +3818,74 @@ std::pair<unsigned, unsigned> TileOp::getGenerateesODSOperandIndexAndLength() {
   return getODSOperandIndexAndLength(odsIndex_generatees);
 }
 
+//===----------------------------------------------------------------------===//
+// FuseOp
+//===----------------------------------------------------------------------===//
+
+/// Print the generatee and applyee CLI lists of an `omp.fuse` operation in
+/// the form `(generatees) <- (applyees)`; an empty list is omitted entirely.
+static void printLoopTransformClis(OpAsmPrinter &p, FuseOp op,
+                                   OperandRange generatees,
+                                   OperandRange applyees) {
+  if (!generatees.empty())
+    p << '(' << llvm::interleaved(generatees) << ')';
+
+  if (!applyees.empty())
+    p << " <- (" << llvm::interleaved(applyees) << ')';
+}
+
+/// Verify the structural constraints of `omp.fuse`: at least two applyees, a
+/// looprange (when given) that is positive and fits inside the applyees, a
+/// matching number of generatees, and applyees that are all produced by
+/// `omp.canonical_loop`.
+LogicalResult FuseOp::verify() {
+  if (getApplyees().size() < 2)
+    return emitOpError() << "must apply to at least two loops";
+
+  if (getFirst() && getCount()) {
+    // Keep the full signed width so zero or negative attribute values are
+    // rejected instead of wrapping around in unsigned arithmetic.
+    const int64_t first = getFirst().getInt();
+    const int64_t count = getCount().getInt();
+    if (first < 1 || count < 1)
+      return emitOpError() << "first and count attributes must be positive";
+    const int64_t numApplyees = getApplyees().size();
+    if (first + count - 1 > numApplyees)
+      return emitOpError() << "the numbers of applyees must be at least first "
+                              "minus one plus count attributes";
+    const int64_t numGeneratees = getGeneratees().size();
+    if (numGeneratees != 0 && numGeneratees != numApplyees + 1 - count)
+      return emitOpError() << "the number of generatees must be the number of "
+                              "aplyees plus one minus count";
+  } else {
+    if (!getGeneratees().empty() && getGeneratees().size() != 1)
+      return emitOpError()
+             << "in a complete fuse the number of generatees must be exactly 1";
+  }
+
+  for (auto &&applyee : getApplyees()) {
+    auto [create, gen, cons] = decodeCli(applyee);
+
+    if (!gen)
+      return emitOpError() << "applyee CLI has no generator";
+    auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner());
+    if (!loop)
+      return emitOpError()
+             << "currently only supports omp.canonical_loop as applyee";
+  }
+  return success();
+}
+
+/// Return the (start index, length) of the applyees operand segment.
+std::pair<unsigned, unsigned> FuseOp::getApplyeesODSOperandIndexAndLength() {
+  return getODSOperandIndexAndLength(odsIndex_applyees);
+}
+
+/// Return the (start index, length) of the generatees operand segment.
+std::pair<unsigned, unsigned> FuseOp::getGenerateesODSOperandIndexAndLength() {
+  return getODSOperandIndexAndLength(odsIndex_generatees);
+}
+
 //===----------------------------------------------------------------------===//
 // Critical construct (2.17.1)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8edec990eaaba..e6880ce33b061 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3207,6
+3207,67 @@ static LogicalResult applyTile(omp::TileOp op, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Apply a `#pragma omp fuse` / `!$omp fuse` transformation using the
+/// OpenMPIRBuilder.
+///
+/// The optional `first`/`count` attributes select a 1-based range of applyees
+/// to fuse; applyees outside that range are passed through unchanged. Without
+/// the attributes every applyee is fused. Generatees, when present, are bound
+/// in sequence order: leading untouched loops, the fused loop, then trailing
+/// untouched loops.
+static LogicalResult applyFuse(omp::FuseOp op, llvm::IRBuilderBase &builder,
+                               LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  llvm::OpenMPIRBuilder::LocationDescription loc(builder);
+
+  // Zero means "attribute absent", i.e. fuse the whole loop sequence.
+  unsigned int first = 0;
+  unsigned int count = 0;
+  if (op.getFirst() && op.getCount()) {
+    first = op.getFirst().getInt();
+    count = op.getCount().getInt();
+  }
+
+  // Select what CLIs are going to be fused
+  SmallVector<llvm::CanonicalLoopInfo *> beforeFuse, toFuse, afterFuse;
+  for (size_t i = 0; i < op.getApplyees().size(); i++) {
+    Value applyee = op.getApplyees()[i];
+    llvm::CanonicalLoopInfo *consBuilderCLI =
+        moduleTranslation.lookupOMPLoop(applyee);
+    // Check the result of the lookup rather than the MLIR value itself.
+    assert(consBuilderCLI && "Canonical loop must already been translated");
+    if (first != 0 && i < first - 1)
+      beforeFuse.push_back(consBuilderCLI);
+    else if (count != 0 && i >= first + count - 1)
+      afterFuse.push_back(consBuilderCLI);
+    else
+      toFuse.push_back(consBuilderCLI);
+  }
+  assert(
+      (op.getGeneratees().empty() ||
+       beforeFuse.size() + afterFuse.size() + 1 == op.getGeneratees().size()) &&
+      "Wrong number of generatees");
+
+  // do the fuse
+  auto generatedLoop = ompBuilder->fuseLoops(loc.DL, toFuse);
+  if (!op.getGeneratees().empty()) {
+    // `next` walks the generatees; each source list is iterated from its own
+    // start so the trailing loops are neither skipped nor read out of bounds.
+    size_t next = 0;
+    for (llvm::CanonicalLoopInfo *cli : beforeFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[next++], cli);
+    moduleTranslation.mapOmpLoop(op.getGeneratees()[next++], generatedLoop);
+    for (llvm::CanonicalLoopInfo *cli : afterFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[next++], cli);
+  }
+
+  // CLIs can only be consumed once
+  for (Value applyee : op.getApplyees())
+    moduleTranslation.invalidateOmpLoop(applyee);
+
+  return success();
+}
+
 /// Convert an Atomic Ordering attribute to llvm::AtomicOrdering.
static llvm::AtomicOrdering convertAtomicOrdering(std::optional ao) { @@ -6288,6 +6339,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::TileOp op) { return applyTile(op, builder, moduleTranslation); }) + .Case([&](omp::FuseOp op) { + return applyFuse(op, builder, moduleTranslation); + }) .Case([&](omp::TargetAllocMemOp) { return convertTargetAllocMemOp(*op, builder, moduleTranslation); }) diff --git a/mlir/test/Dialect/OpenMP/cli-fuse.mlir b/mlir/test/Dialect/OpenMP/cli-fuse.mlir new file mode 100644 index 0000000000000..284b8c914ae1f --- /dev/null +++ b/mlir/test/Dialect/OpenMP/cli-fuse.mlir @@ -0,0 +1,114 @@ +// RUN: mlir-opt %s | FileCheck %s --enable-var-scope +// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope + + +// Raw syntax check (MLIR output is always pretty-printed) +// CHECK-LABEL: @omp_fuse_raw( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_raw(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %canonloop_s1 = omp.new_cli + %canonloop_s1 = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: %fused = omp.new_cli + %fused = "omp.new_cli" () : () -> (!omp.cli) + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + "omp.canonical_loop" (%tc1, %canonloop_s0) ({ + ^bb0(%iv_s0: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + "omp.canonical_loop" (%tc2, %canonloop_s1) ({ + ^bb0(%iv_s1: i32): + // CHECK: omp.terminator + omp.terminator + }) : (i32, !omp.cli) -> () + // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1) + "omp.fuse"(%fused, %canonloop_s0, %canonloop_s1) <{operandSegmentSizes = array}> : (!omp.cli, !omp.cli, !omp.cli) -> () + return +} + +// Pretty syntax check +// CHECK-LABEL: @omp_fuse_pretty( +// CHECK-SAME: 
%[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_pretty(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %fused = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1) + omp.fuse(%fused) <- (%canonloop_s0, %canonloop_s1) + return +} + +// Specifying the generatees for omp.fuse is optional +// CHECK-LABEL: @omp_fuse_optionalgen_pretty( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) { +func.func @omp_fuse_optionalgen_pretty(%tc1 : i32, %tc2 : i32) -> () { + // CHECK-NEXT: %canonloop_s0 = omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: %canonloop_s1 = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse <- (%canonloop_s0, %canonloop_s1) + omp.fuse <- (%canonloop_s0, %canonloop_s1) + return +} + +// Fuse with looprange attributes +// CHECK-LABEL: @omp_fuse_looprange( +// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[tc3:.+]]: i32) { +func.func @omp_fuse_looprange(%tc1 : i32, %tc2 : i32, %tc3 : i32) -> () { + // CHECK-NEXT: %[[CANONLOOP:.+]] = 
omp.new_cli + %canonloop_s0 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s1 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_s2 = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %canonloop_fuse = omp.new_cli + // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli + %fused = omp.new_cli + // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) { + omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) { + omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%[[tc3]]) { + omp.canonical_loop (%canonloop_s2) %iv_s2 : i32 in range(%tc3) { + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.fuse (%canonloop_fuse, %fused) <- (%canonloop_s0, + // %canonloop_s1, %canonloop_s2) {count = 2 : i32, first = 1 : i32} + omp.fuse(%fused, %canonloop_fuse) <- (%canonloop_s0, %canonloop_s1, %canonloop_s2) {count = 2 : i32, first = 1 : i32} + return +} + diff --git a/mlir/test/Dialect/OpenMP/invalid-fuse.mlir b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir new file mode 100644 index 0000000000000..d763ffcea71a2 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir @@ -0,0 +1,100 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s + + +func.func @no_loops(%tc1 : i32, %tc2 : i32) { + // expected-error@+1 {{'omp.fuse' op must apply to at least two loops}} + omp.fuse <-() + + return +} + +// ----- + +func.func @one_loop(%tc1 : i32, %tc2 : i32) { + %canonloop = omp.new_cli + omp.canonical_loop(%canonloop) %iv : i32 in range(%tc1) { + omp.terminator + } + // expected-error@+1 {{'omp.fuse' op must apply to at least two loops}} + omp.fuse <-(%canonloop) + + return +} + +// ----- + +func.func @missing_generator(%tc1 : 
i32, %tc2 : i32) { + // expected-error@+1 {{'omp.new_cli' op CLI has no generator}} + %canonloop = omp.new_cli + + // expected-note@+1 {{see consumer here: "omp.fuse"(%0) <{operandSegmentSizes = array}> : (!omp.cli) -> ()}} + omp.fuse <-(%canonloop) + + return +} + +// ----- + +func.func @wrong_generatees1(%tc1 : i32, %tc2 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + + %fused1 = omp.new_cli + %fused2 = omp.new_cli + // expected-error@+1 {{'omp.fuse' op in a complete fuse the number of generatees must be exactly 1}} + omp.fuse (%fused1, %fused2) <-(%canonloop1, %canonloop2) + + llvm.return +} + +// ----- + +func.func @wrong_generatees2(%tc1 : i32, %tc2 : i32, %tc3 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) { + omp.terminator + } + + %fused = omp.new_cli + // expected-error@+1 {{'omp.fuse' op the number of generatees must be the number of aplyees plus one minus count}} + omp.fuse (%fused) <-(%canonloop1, %canonloop2, %canonloop3) {first = 1 : i32, count = 2 : i32} + + llvm.return +} + +func.func @wrong_applyees(%tc1 : i32, %tc2 : i32, %tc3 : i32) { + %canonloop1 = omp.new_cli + %canonloop2 = omp.new_cli + %canonloop3 = omp.new_cli + omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) { + omp.terminator + } + omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) { + omp.terminator + } + omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) { + omp.terminator + } + + %fused = omp.new_cli + %canonloop_fuse = omp.new_cli + // expected-error@+1 {{'omp.fuse' op the numbers of applyees must be at least first 
minus one plus count attributes}} + omp.fuse (%fused, %canonloop_fuse) <-(%canonloop1, %canonloop2, %canonloop3) {first = 1 : i32, count = 5 : i32} + + llvm.return +} + diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir new file mode 100644 index 0000000000000..0754572b24771 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir @@ -0,0 +1,100 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @fuse_trivial_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32) -> () { + %literal_cli1 = omp.new_cli + omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli2 = omp.new_cli + omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(21.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + omp.fuse <- (%literal_cli1, %literal_cli2) + llvm.return +} + +// CHECK-LABEL: define void @fuse_trivial_loops( +// CHECK-SAME: ptr %[[VAL_11:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]: +// CHECK-NEXT: %[[VAL_15:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_16:.+]] +// CHECK-NEXT: %[[VAL_17:.+]] = select i1 %[[VAL_15:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]] +// CHECK-NEXT: br 
label %[[OMP_FUSED_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_HEADER]]: +// CHECK-NEXT: %[[VAL_4:.+]] = phi i32 [ 0, %[[VAL_18:.+]] ], [ %[[VAL_27:.+]], %[[VAL_26:.+]] ] +// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_COND]]: +// CHECK-NEXT: %[[VAL_29:.+]] = icmp ult i32 %[[VAL_4:.+]], %[[VAL_17:.+]] +// CHECK-NEXT: br i1 %[[VAL_29:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_BODY]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]: +// CHECK-NEXT: %[[VAL_3:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_5:.+]] +// CHECK-NEXT: br i1 %[[VAL_3:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: +// CHECK-NEXT: %[[VAL_10:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_10:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND13]]: +// CHECK-NEXT: %[[VAL_19:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_16:.+]] +// CHECK-NEXT: br i1 %[[VAL_19:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[VAL_23:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]] +// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_23:.+]], align 4 +// CHECK-NEXT: 
br label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INC]]: +// CHECK-NEXT: %[[VAL_27:.+]] = add nuw i32 %[[VAL_4:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]: +// CHECK-NEXT: ret void + diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir new file mode 100644 index 0000000000000..0032bd86501d0 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir @@ -0,0 +1,140 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope + + +llvm.func @fuse_looprange_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %tc3: i32) -> () { + %literal_cli1 = omp.new_cli + omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(42.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli2 = omp.new_cli + omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(21.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + omp.terminator + } + %literal_cli3 = omp.new_cli + omp.canonical_loop(%literal_cli3) %iv3 : i32 in range(%tc3) { + %ptr = llvm.getelementptr inbounds %baseptr[%iv3] : (!llvm.ptr, i32) -> !llvm.ptr, f32 + %val = llvm.mlir.constant(63.0 : f32) : f32 + llvm.store %val, %ptr : f32, !llvm.ptr + 
omp.terminator + } + omp.fuse <- (%literal_cli1, %literal_cli2, %literal_cli3) {first = 1 : i32, count = 2 : i32} + llvm.return +} + + +// CHECK-LABEL: define void @fuse_looprange_loops( +// CHECK-SAME: ptr %[[VAL_23:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]], i32 %[[VAL_40:.+]]) { +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]: +// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]: +// CHECK-NEXT: %[[VAL_4:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_6:.+]] +// CHECK-NEXT: %[[VAL_7:.+]] = select i1 %[[VAL_4:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]] +// CHECK-NEXT: br label %[[OMP_FUSED_PREHEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]: +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_HEADER]]: +// CHECK-NEXT: %[[VAL_11:.+]] = phi i32 [ 0, %[[VAL_8:.+]] ], [ %[[VAL_12:.+]], %[[VAL_10:.+]] ] +// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_COND]]: +// CHECK-NEXT: %[[VAL_14:.+]] = icmp ult i32 %[[VAL_11:.+]], %[[VAL_7:.+]] +// CHECK-NEXT: br i1 %[[VAL_14:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_BODY]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]: +// CHECK-NEXT: %[[VAL_18:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_5:.+]] +// CHECK-NEXT: br i1 %[[VAL_18:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND25:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION]]: 
+// CHECK-NEXT: %[[VAL_22:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]] +// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_22:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND25:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INNER_COND25]]: +// CHECK-NEXT: %[[VAL_25:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_6:.+]] +// CHECK-NEXT: br i1 %[[VAL_25:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION12]]: +// CHECK-NEXT: %[[VAL_29:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]] +// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_29:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT11]]: +// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]: +// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_INC]]: +// CHECK-NEXT: %[[VAL_12:.+]] = add nuw i32 %[[VAL_11:.+]], 1 +// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_EXIT]]: +// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_FUSED_AFTER]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER13:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER13]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER14]]: +// CHECK-NEXT: %[[VAL_36:.+]] = phi i32 [ 0, %[[VAL_33:.+]] ], [ %[[VAL_37:.+]], %[[VAL_35:.+]] ] +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND15:.+]] +// 
CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_COND15]]: +// CHECK-NEXT: %[[VAL_39:.+]] = icmp ult i32 %[[VAL_36:.+]], %[[VAL_40:.+]] +// CHECK-NEXT: br i1 %[[VAL_39:.+]], label %[[OMP_OMP_LOOP_BODY16:.+]], label %[[OMP_OMP_LOOP_EXIT18:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_BODY16]]: +// CHECK-NEXT: br label %[[OMP_LOOP_REGION24:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_LOOP_REGION24]]: +// CHECK-NEXT: %[[VAL_44:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_36:.+]] +// CHECK-NEXT: store float 6.300000e+01, ptr %[[VAL_44:.+]], align 4 +// CHECK-NEXT: br label %[[OMP_REGION_CONT23:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_REGION_CONT23]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC17:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_INC17]]: +// CHECK-NEXT: %[[VAL_37:.+]] = add nuw i32 %[[VAL_36:.+]], 1 +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT18]]: +// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER19:.+]] +// CHECK-EMPTY: +// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER19]]: +// CHECK-NEXT: ret void + diff --git a/openmp/runtime/test/transform/fuse/do-looprange.f90 b/openmp/runtime/test/transform/fuse/do-looprange.f90 new file mode 100644 index 0000000000000..8c62b24c4744f --- /dev/null +++ b/openmp/runtime/test/transform/fuse/do-looprange.f90 @@ -0,0 +1,60 @@ +! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe +! RUN: %t.exe | FileCheck %s --match-full-lines + +program fuse_full + implicit none + integer i, j, k, u + + print *, 'do' + + !$OMP FUSE LOOPRANGE(2,2) + do i=5, 25, 5 + print '("i=", I0)', i + end do + do j=10, 100, 10 + print '("j=", I0)', j + end do + do k=10, 0, -1 + print '("k=", I0)', k + end do + do u=5, 25, 5 + print '("u=", I0)', u + end do + !$OMP END FUSE + + print *, 'done' +end program + +! CHECK: do +! CHECK-NEXT: i=5 +! CHECK-NEXT: i=10 +! CHECK-NEXT: i=15 +! CHECK-NEXT: i=20 +! CHECK-NEXT: i=25 +! CHECK-NEXT: j=10 +! 
CHECK-NEXT: k=10 +! CHECK-NEXT: j=20 +! CHECK-NEXT: k=9 +! CHECK-NEXT: j=30 +! CHECK-NEXT: k=8 +! CHECK-NEXT: j=40 +! CHECK-NEXT: k=7 +! CHECK-NEXT: j=50 +! CHECK-NEXT: k=6 +! CHECK-NEXT: j=60 +! CHECK-NEXT: k=5 +! CHECK-NEXT: j=70 +! CHECK-NEXT: k=4 +! CHECK-NEXT: j=80 +! CHECK-NEXT: k=3 +! CHECK-NEXT: j=90 +! CHECK-NEXT: k=2 +! CHECK-NEXT: j=100 +! CHECK-NEXT: k=1 +! CHECK-NEXT: k=0 +! CHECK-NEXT: u=5 +! CHECK-NEXT: u=10 +! CHECK-NEXT: u=15 +! CHECK-NEXT: u=20 +! CHECK-NEXT: u=25 +! CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/fuse/do.f90 b/openmp/runtime/test/transform/fuse/do.f90 new file mode 100644 index 0000000000000..d4496bce4d723 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/do.f90 @@ -0,0 +1,52 @@ +! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe +! RUN: %t.exe | FileCheck %s --match-full-lines + +program fuse_full + implicit none + integer i, j, k + + print *, 'do' + + !$OMP FUSE + do i=5, 25, 5 + print '("i=", I0)', i + end do + do j=10, 100, 10 + print '("j=", I0)', j + end do + do k=10, 0, -1 + print '("k=", I0)', k + end do + !$OMP END FUSE + + print *, 'done' +end program + +! CHECK: do +! CHECK-NEXT: i=5 +! CHECK-NEXT: j=10 +! CHECK-NEXT: k=10 +! CHECK-NEXT: i=10 +! CHECK-NEXT: j=20 +! CHECK-NEXT: k=9 +! CHECK-NEXT: i=15 +! CHECK-NEXT: j=30 +! CHECK-NEXT: k=8 +! CHECK-NEXT: i=20 +! CHECK-NEXT: j=40 +! CHECK-NEXT: k=7 +! CHECK-NEXT: i=25 +! CHECK-NEXT: j=50 +! CHECK-NEXT: k=6 +! CHECK-NEXT: j=60 +! CHECK-NEXT: k=5 +! CHECK-NEXT: j=70 +! CHECK-NEXT: k=4 +! CHECK-NEXT: j=80 +! CHECK-NEXT: k=3 +! CHECK-NEXT: j=90 +! CHECK-NEXT: k=2 +! CHECK-NEXT: j=100 +! CHECK-NEXT: k=1 +! CHECK-NEXT: k=0 +! CHECK-NEXT: done