diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 8571a2d9d0939..252a78648dd74 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -180,6 +180,11 @@ static constexpr StringLiteral getRoutineInfoAttrName() { return StringLiteral("acc.routine_info"); } +/// Used to check whether the current operation is an `acc routine` +inline bool isAccRoutineOp(mlir::Operation *op) { + return op->hasAttr(mlir::acc::getRoutineInfoAttrName()); +} + static constexpr StringLiteral getFromDefaultClauseAttrName() { return StringLiteral("acc.from_default"); } diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td index cad78df2fbb0b..713aaabee65f0 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td @@ -63,6 +63,34 @@ def ACCImplicitData : Pass<"acc-implicit-data", "mlir::ModuleOp"> { ]; } +def ACCImplicitDeclare : Pass<"acc-implicit-declare", "mlir::ModuleOp"> { + let summary = "Applies implicit acc declare to globals referenced in compute and routine acc regions"; + let description = [{ + This pass applies implicit `acc declare` actions to global variables + referenced in OpenACC compute regions and routine functions. + + The pass performs the following actions: + + 1. Hoists address-of operations for non-constant globals out of OpenACC + regions when they can be implicitly mapped rather than declared. + + 2. Collects global symbols referenced in: + - OpenACC compute constructs (parallel, kernels, serial) + - Functions marked with acc routine + - Initialization regions of existing acc declare globals + - Private/firstprivate/reduction recipe operations + + 3. Marks collected globals with the acc.declare attribute using the + copyin data clause. + + The pass avoids unnecessary declare marking by: + - Skipping function symbols (which use acc routine instead) + - Hoisting non-constant global references that can use implicit mapping + - Only processing symbols that are not already valid in device regions + }]; + let dependentDialects = ["mlir::acc::OpenACCDialect"]; +} + def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> { let summary = "Generate implicit acc routine for functions in acc regions"; let description = [{ diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitDeclare.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitDeclare.cpp new file mode 100644 index 0000000000000..766f690e21459 --- /dev/null +++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitDeclare.cpp @@ -0,0 +1,429 @@ +//===- ACCImplicitDeclare.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass applies implicit `acc declare` actions to global variables +// referenced in OpenACC compute regions and routine functions. +// +// Overview: +// --------- +// Global references in an acc regions (for globals not marked with `acc +// declare` by the user) can be handled in one of two ways: +// - Mapped through data clauses +// - Implicitly marked as `acc declare` (this pass) +// +// Thus, the OpenACC specification focuses solely on implicit data mapping rules +// whose implementation is captured in `ACCImplicitData` pass. +// +// However, it is both advantageous and required for certain cases to +// use implicit `acc declare` instead: +// - Any functions that are implicitly marked as `acc routine` through +// `ACCImplicitRoutine` may reference globals. Since data mapping +// is only possible for compute regions, such globals can only be +// made available on device through `acc declare`. +// - Compiler can generate and use globals for cases needed in IR +// representation such as type descriptors or various names needed for +// runtime calls and error reporting - such cases often are introduced +// after a frontend semantic checking is done since it is related to +// implementation detail. Thus, such compiler generated globals would +// not have been visible for a user to mark with `acc declare`. +// - Constant globals such as filename strings or data initialization values +// are values that do not get mutated but are still needed for appropriate +// runtime execution. If a kernel is launched 1000 times, it is not a +// good idea to map such a global 1000 times. Therefore, such globals +// benefit from being marked with `acc declare`. +// +// This pass automatically +// marks global variables with the `acc.declare` attribute when they are +// referenced in OpenACC compute constructs or routine functions and meet +// the criteria noted above, ensuring +// they are properly handled for device execution. +// +// The pass performs two main optimizations: +// +// 1. Hoisting: For non-constant globals referenced in compute regions, the +// pass hoists the address-of operation out of the region when possible, +// allowing them to be implicitly mapped through normal data clause +// mechanisms rather than requiring declare marking. +// +// 2. Declaration: For globals that must be available on the device (constants, +// globals in routines, globals in recipe operations), the pass adds the +// `acc.declare` attribute with the copyin data clause. +// +// Requirements: +// ------------- +// To use this pass in a pipeline, the following requirements must be met: +// +// 1. Operation Interface Implementation: Operations that compute addresses +// of global variables must implement the `acc::AddressOfGlobalOpInterface` +// and those that represent globals must implement the +// `acc::GlobalOpInterface`. Additionally, any operations that indirectly +// access globals must implement the `acc::IndirectGlobalAccessOpInterface`. +// +// 2. Analysis Registration (Optional): If custom behavior is needed for +// determining if a symbol use is valid within GPU regions, the dialect +// should pre-register the `acc::OpenACCSupport` analysis. +// +// Examples: +// --------- +// +// Example 1: Non-constant global in compute region (hoisted) +// +// Before: +// memref.global @g_scalar : memref = dense<0.0> +// func.func @test() { +// acc.serial { +// %addr = memref.get_global @g_scalar : memref +// %val = memref.load %addr[] : memref +// acc.yield +// } +// } +// +// After: +// memref.global @g_scalar : memref = dense<0.0> +// func.func @test() { +// %addr = memref.get_global @g_scalar : memref +// acc.serial { +// %val = memref.load %addr[] : memref +// acc.yield +// } +// } +// +// Example 2: Constant global in compute region (declared) +// +// Before: +// memref.global constant @g_const : memref = dense<1.0> +// func.func @test() { +// acc.serial { +// %addr = memref.get_global @g_const : memref +// %val = memref.load %addr[] : memref +// acc.yield +// } +// } +// +// After: +// memref.global constant @g_const : memref = dense<1.0> +// {acc.declare = #acc.declare} +// func.func @test() { +// acc.serial { +// %addr = memref.get_global @g_const : memref +// %val = memref.load %addr[] : memref +// acc.yield +// } +// } +// +// Example 3: Global in acc routine (declared) +// +// Before: +// memref.global @g_data : memref = dense<0.0> +// acc.routine @routine_0 func(@device_func) +// func.func @device_func() attributes {acc.routine_info = ...} { +// %addr = memref.get_global @g_data : memref +// %val = memref.load %addr[] : memref +// } +// +// After: +// memref.global @g_data : memref = dense<0.0> +// {acc.declare = #acc.declare} +// acc.routine @routine_0 func(@device_func) +// func.func @device_func() attributes {acc.routine_info = ...} { +// %addr = memref.get_global @g_data : memref +// %val = memref.load %addr[] : memref +// } +// +// Example 4: Global in private recipe (declared if recipe is used) +// +// Before: +// memref.global @g_init : memref = dense<0.0> +// acc.private.recipe @priv_recipe : memref init { +// ^bb0(%arg0: memref): +// %alloc = memref.alloc() : memref +// %global = memref.get_global @g_init : memref +// %val = memref.load %global[] : memref +// memref.store %val, %alloc[] : memref +// acc.yield %alloc : memref +// } destroy { ... } +// func.func @test() { +// %var = memref.alloc() : memref +// %priv = acc.private varPtr(%var : memref) +// recipe(@priv_recipe) -> memref +// acc.parallel private(%priv : memref) { ... } +// } +// +// After: +// memref.global @g_init : memref = dense<0.0> +// {acc.declare = #acc.declare} +// acc.private.recipe @priv_recipe : memref init { +// ^bb0(%arg0: memref): +// %alloc = memref.alloc() : memref +// %global = memref.get_global @g_init : memref +// %val = memref.load %global[] : memref +// memref.store %val, %alloc[] : memref +// acc.yield %alloc : memref +// } destroy { ... } +// func.func @test() { +// %var = memref.alloc() : memref +// %priv = acc.private varPtr(%var : memref) +// recipe(@priv_recipe) -> memref +// acc.parallel private(%priv : memref) { ... } +// } +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenACC/Transforms/Passes.h" + +#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Interfaces/FunctionInterfaces.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/TypeSwitch.h" + +namespace mlir { +namespace acc { +#define GEN_PASS_DEF_ACCIMPLICITDECLARE +#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc" +} // namespace acc +} // namespace mlir + +#define DEBUG_TYPE "acc-implicit-declare" + +using namespace mlir; + +namespace { + +using GlobalOpSetT = llvm::SmallSetVector; + +/// Checks whether a use of the requested `globalOp` should be considered +/// for hoisting out of acc region due to avoid `acc declare`ing something +/// that instead should be implicitly mapped. +static bool isGlobalUseCandidateForHoisting(Operation *globalOp, + Operation *user, + SymbolRefAttr symbol, + acc::OpenACCSupport &accSupport) { + // This symbol is valid in GPU region. This means semantics + // would change if moved to host - therefore it is not a candidate. + if (accSupport.isValidSymbolUse(user, symbol)) + return false; + + bool isConstant = false; + bool isFunction = false; + + if (auto globalVarOp = dyn_cast(globalOp)) + isConstant = globalVarOp.isConstant(); + + if (isa(globalOp)) + isFunction = true; + + // Constants should be kept in device code to ensure they are duplicated. + // Function references should be kept in device code to ensure their device + // addresses are computed. Everything else should be hoisted since we already + // proved they are not valid symbols in GPU region. + return !isConstant && !isFunction; +} + +/// Checks whether it is valid to use acc.declare marking on the global. +bool isValidForAccDeclare(Operation *globalOp) { + // For functions - we use acc.routine marking instead. + return !isa(globalOp); +} + +/// Checks whether a recipe operation has meaningful use of its symbol that +/// justifies processing its regions for global references. Returns false if: +/// 1. The recipe has no symbol uses at all, or +/// 2. The only symbol use is the recipe's own symbol definition +template +static bool hasRelevantRecipeUse(RecipeOpT &recipeOp, ModuleOp &mod) { + std::optional symbolUses = recipeOp.getSymbolUses(mod); + + // No recipe symbol uses. + if (!symbolUses.has_value() || symbolUses->empty()) + return false; + + // If more than one use, assume it's used. + auto begin = symbolUses->begin(); + auto end = symbolUses->end(); + if (begin != end && std::next(begin) != end) + return true; + + // If single use, check if the use is the recipe itself. + const SymbolTable::SymbolUse &use = *symbolUses->begin(); + return use.getUser() != recipeOp.getOperation(); +} + +// Hoists addr_of operations for non-constant globals out of OpenACC regions. +// This way - they are implicitly mapped instead of being considered for +// implicit declare. +template +static void hoistNonConstantDirectUses(AccConstructT accOp, + acc::OpenACCSupport &accSupport) { + accOp.walk([&](acc::AddressOfGlobalOpInterface addrOfOp) { + SymbolRefAttr symRef = addrOfOp.getSymbol(); + if (symRef) { + Operation *globalOp = + SymbolTable::lookupNearestSymbolFrom(addrOfOp, symRef); + if (isGlobalUseCandidateForHoisting(globalOp, addrOfOp, symRef, + accSupport)) { + addrOfOp->moveBefore(accOp); + LLVM_DEBUG( + llvm::dbgs() << "Hoisted:\n\t" << addrOfOp << "\n\tfrom:\n\t"; + accOp->print(llvm::dbgs(), + OpPrintingFlags{}.skipRegions().enableDebugInfo()); + llvm::dbgs() << "\n"); + } + } + }); +} + +// Collects the globals referenced in a device region +static void collectGlobalsFromDeviceRegion(Region ®ion, + GlobalOpSetT &globals, + acc::OpenACCSupport &accSupport, + SymbolTable &symTab) { + region.walk([&](Operation *op) { + // 1) Only consider relevant operations which use symbols + auto addrOfOp = dyn_cast(op); + if (addrOfOp) { + SymbolRefAttr symRef = addrOfOp.getSymbol(); + // 2) Found an operation which uses the symbol. Next determine if it + // is a candidate for `acc declare`. Some of the criteria considered + // is whether this symbol is not already a device one (either because + // acc declare is already used or this is a CUF global). + Operation *globalOp = nullptr; + bool isCandidate = !accSupport.isValidSymbolUse(op, symRef, &globalOp); + // 3) Add the candidate to the set of globals to be `acc declare`d. + if (isCandidate && globalOp && isValidForAccDeclare(globalOp)) + globals.insert(globalOp); + } else if (auto indirectAccessOp = + dyn_cast(op)) { + // Process operations that indirectly access globals + llvm::SmallVector symbols; + indirectAccessOp.getReferencedSymbols(symbols, &symTab); + for (SymbolRefAttr symRef : symbols) + if (Operation *globalOp = symTab.lookup(symRef.getLeafReference())) + if (isValidForAccDeclare(globalOp)) + globals.insert(globalOp); + } + }); +} + +// Adds the declare attribute to the operation `op`. +static void addDeclareAttr(MLIRContext *context, Operation *op, + acc::DataClause clause) { + op->setAttr(acc::getDeclareAttrName(), + acc::DeclareAttr::get(context, + acc::DataClauseAttr::get(context, clause))); +} + +// This pass applies implicit declare actions for globals referenced in +// OpenACC compute and routine regions. +class ACCImplicitDeclare + : public acc::impl::ACCImplicitDeclareBase { +public: + using ACCImplicitDeclareBase::ACCImplicitDeclareBase; + + void runOnOperation() override { + ModuleOp mod = getOperation(); + MLIRContext *context = &getContext(); + acc::OpenACCSupport &accSupport = getAnalysis(); + + // 1) Start off by hoisting any AddressOf operations out of acc region + // for any cases we do not want to `acc declare`. This is because we can + // rely on implicit data mapping in majority of cases without uselessly + // polluting the device globals. + mod.walk([&](Operation *op) { + TypeSwitch(op) + .Case( + [&](auto accOp) { + hoistNonConstantDirectUses(accOp, accSupport); + }); + }); + + // 2) Collect global symbols which need to be `acc declare`d. Do it for + // compute regions, acc routine, and existing globals with the declare + // attribute. + SymbolTable symTab(mod); + GlobalOpSetT globalsToAccDeclare; + mod.walk([&](Operation *op) { + TypeSwitch(op) + .Case( + [&](auto accOp) { + collectGlobalsFromDeviceRegion( + accOp.getRegion(), globalsToAccDeclare, accSupport, symTab); + }) + .Case([&](auto func) { + if (acc::isAccRoutineOp(func) && !func.isExternal()) + collectGlobalsFromDeviceRegion(func.getFunctionBody(), + globalsToAccDeclare, accSupport, + symTab); + }) + .Case([&](auto globalVarOp) { + if (globalVarOp->getAttr(acc::getDeclareAttrName())) + if (Region *initRegion = globalVarOp.getInitRegion()) + collectGlobalsFromDeviceRegion(*initRegion, globalsToAccDeclare, + accSupport, symTab); + }) + .Case([&](auto privateRecipe) { + if (hasRelevantRecipeUse(privateRecipe, mod)) { + collectGlobalsFromDeviceRegion(privateRecipe.getInitRegion(), + globalsToAccDeclare, accSupport, + symTab); + collectGlobalsFromDeviceRegion(privateRecipe.getDestroyRegion(), + globalsToAccDeclare, accSupport, + symTab); + } + }) + .Case([&](auto firstprivateRecipe) { + if (hasRelevantRecipeUse(firstprivateRecipe, mod)) { + collectGlobalsFromDeviceRegion(firstprivateRecipe.getInitRegion(), + globalsToAccDeclare, accSupport, + symTab); + collectGlobalsFromDeviceRegion( + firstprivateRecipe.getDestroyRegion(), globalsToAccDeclare, + accSupport, symTab); + collectGlobalsFromDeviceRegion(firstprivateRecipe.getCopyRegion(), + globalsToAccDeclare, accSupport, + symTab); + } + }) + .Case([&](auto reductionRecipe) { + if (hasRelevantRecipeUse(reductionRecipe, mod)) { + collectGlobalsFromDeviceRegion(reductionRecipe.getInitRegion(), + globalsToAccDeclare, accSupport, + symTab); + collectGlobalsFromDeviceRegion( + reductionRecipe.getCombinerRegion(), globalsToAccDeclare, + accSupport, symTab); + } + }); + }); + + // 3) Finally, generate the appropriate declare actions needed to ensure + // this is considered for device global. + for (Operation *globalOp : globalsToAccDeclare) { + LLVM_DEBUG( + llvm::dbgs() << "Global is being `acc declare copyin`d: "; + globalOp->print(llvm::dbgs(), + OpPrintingFlags{}.skipRegions().enableDebugInfo()); + llvm::dbgs() << "\n"); + + // Mark it as declare copyin. + addDeclareAttr(context, globalOp, acc::DataClause::acc_copyin); + + // TODO: May need to create the global constructor which does the mapping + // action. It is not yet clear if this is needed yet (since the globals + // might just end up in the GPU image without requiring mapping via + // runtime). + } + } +}; + +} // namespace diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt index 028af0362f26e..2c6da87c66a11 100644 --- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIROpenACCTransforms ACCImplicitData.cpp + ACCImplicitDeclare.cpp ACCImplicitRoutine.cpp LegalizeDataValues.cpp diff --git a/mlir/test/Dialect/OpenACC/acc-implicit-declare.mlir b/mlir/test/Dialect/OpenACC/acc-implicit-declare.mlir new file mode 100644 index 0000000000000..74ff3384c093c --- /dev/null +++ b/mlir/test/Dialect/OpenACC/acc-implicit-declare.mlir @@ -0,0 +1,175 @@ +// RUN: mlir-opt %s --pass-pipeline="builtin.module(acc-implicit-declare)" -split-input-file 2>&1 | FileCheck %s + +// ----- + +// Test that non-constant scalar globals in compute regions are hoisted +// instead of being marked with acc declare + +memref.global @gscalar : memref = dense<0.0> + +func.func @test_scalar_in_serial() { + acc.serial { + %addr = memref.get_global @gscalar : memref + %load = memref.load %addr[] : memref + acc.yield + } + return +} + +// Expected to hoist this global access out of acc region instead of marking +// with `acc declare`. +// CHECK-LABEL: func.func @test_scalar_in_serial +// CHECK: memref.get_global @gscalar +// CHECK: acc.serial +// CHECK-NOT: acc.declare + +// ----- + +// Test that constant globals are marked with acc declare + +memref.global constant @gscalarconst : memref = dense<1.0> + +func.func @test_constant_in_serial() { + acc.serial { + %addr = memref.get_global @gscalarconst : memref + %load = memref.load %addr[] : memref + acc.yield + } + return +} + +// This is expected to be `acc declare`'d since it is a constant. +// CHECK: memref.global constant @gscalarconst {{.*}} {acc.declare = #acc.declare} + +// ----- + +// Test globals referenced in acc routine functions + +memref.global @gscalar_routine : memref = dense<0.0> + +acc.routine @acc_routine_0 func(@test_scalar_in_accroutine) +func.func @test_scalar_in_accroutine() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} { + %addr = memref.get_global @gscalar_routine : memref + %load = memref.load %addr[] : memref + return +} + +// Global should be acc declare'd because it's in an acc routine +// CHECK: memref.global @gscalar_routine {{.*}} {acc.declare = #acc.declare} + +// ----- + +// Test constant globals in acc routine + +memref.global constant @gscalarconst_routine : memref = dense<1.0> + +acc.routine @acc_routine_0 func(@test_constant_in_accroutine) +func.func @test_constant_in_accroutine() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} { + %addr = memref.get_global @gscalarconst_routine : memref + %load = memref.load %addr[] : memref + return +} + +// CHECK: memref.global constant @gscalarconst_routine {{.*}} {acc.declare = #acc.declare} + +// ----- + +// Test acc.private.recipe with global reference - referenced variant + +memref.global @global_for_private : memref = dense<0.0> + +acc.private.recipe @private_recipe_with_global : memref init { +^bb0(%arg0: memref): + %0 = memref.alloc() : memref + %global_addr = memref.get_global @global_for_private : memref + %global_val = memref.load %global_addr[] : memref + memref.store %global_val, %0[] : memref + acc.yield %0 : memref +} destroy { +^bb0(%arg0: memref): + memref.dealloc %arg0 : memref + acc.terminator +} + +func.func @test_private_recipe_referenced() { + %var = memref.alloc() : memref + %priv = acc.private varPtr(%var : memref) recipe(@private_recipe_with_global) -> memref + acc.parallel private(%priv : memref) { + %load = memref.load %var[] : memref + acc.yield + } + memref.dealloc %var : memref + return +} + +// Global should be acc declare'd because the recipe is referenced +// CHECK: memref.global @global_for_private {{.*}} {acc.declare = #acc.declare} + +// ----- + +// Test acc.private.recipe with global reference - unreferenced variant + +memref.global @global_for_private_unused : memref = dense<0.0> + +acc.private.recipe @private_recipe_unused : memref init { +^bb0(%arg0: memref): + %0 = memref.alloc() : memref + %global_addr = memref.get_global @global_for_private_unused : memref + %global_val = memref.load %global_addr[] : memref + memref.store %global_val, %0[] : memref + acc.yield %0 : memref +} destroy { +^bb0(%arg0: memref): + memref.dealloc %arg0 : memref + acc.terminator +} + +func.func @test_private_recipe_not_referenced() { + %var = memref.alloc() : memref + acc.parallel { + %load = memref.load %var[] : memref + acc.yield + } + memref.dealloc %var : memref + return +} + +// Global should NOT be acc declare'd because the recipe is not referenced +// CHECK-NOT: memref.global @global_for_private_unused {{.*}} {acc.declare + +// ----- + +// Test globals in different compute constructs (parallel, kernels, serial) + +memref.global @global_parallel : memref = dense<0.0> +memref.global @global_kernels : memref = dense<0.0> +memref.global constant @global_serial_const : memref = dense<1.0> + +func.func @test_multiple_constructs() { + acc.parallel { + %addr = memref.get_global @global_parallel : memref + %load = memref.load %addr[] : memref + acc.yield + } + acc.kernels { + %addr = memref.get_global @global_kernels : memref + %load = memref.load %addr[] : memref + acc.terminator + } + acc.serial { + %addr = memref.get_global @global_serial_const : memref + %load = memref.load %addr[] : memref + acc.yield + } + return +} + +// Non-constant globals ARE hoisted before their compute regions +// Constant global should be marked with acc.declare +// CHECK: memref.global constant @global_serial_const {{.*}} {acc.declare = #acc.declare} +// CHECK-LABEL: func.func @test_multiple_constructs +// CHECK: memref.get_global @global_parallel +// CHECK-NEXT: acc.parallel +// CHECK: memref.get_global @global_kernels +// CHECK-NEXT: acc.kernels +