diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index 18b3b790338d8..0d7c3be240c99 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -36,10 +36,6 @@ std::unique_ptr<OperationPass<FuncOp>> createSimplifyAffineStructuresPass(); std::unique_ptr<OperationPass<FuncOp>> createAffineLoopInvariantCodeMotionPass(); -/// Creates a pass to convert all parallel affine.for's into 1-d affine.parallel -/// ops. -std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass(); - /// Performs packing (or explicit copying) of accessed memref regions into /// buffers in the specified faster memory space through either pointwise copies /// or DMA operations. diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index 810640058155f..06e0920413a95 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -112,11 +112,6 @@ def AffineVectorize : FunctionPass<"affine-super-vectorize"> { ]; } -def AffineParallelize : FunctionPass<"affine-parallelize"> { - let summary = "Convert affine.for ops into 1-D affine.parallel"; - let constructor = "mlir::createAffineParallelizePass()"; -} - def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> { let summary = "Simplify affine expressions in maps/sets and normalize " "memrefs"; diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h index 19df93f760f5e..a2c0211b301e3 100644 --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -15,16 +15,9 @@ namespace mlir { -class AffineForOp; class AffineIfOp; -class AffineParallelOp; struct LogicalResult; -/// Replaces parallel affine.for op with 1-d affine.parallel op. -/// mlir::isLoopParallel detect the parallel affine.for ops. -/// There is no cost model currently used to drive this parallelization. 
-void affineParallelize(AffineForOp forOp); - /// Hoists out affine.if/else to as high as possible, i.e., past all invariant /// affine.fors/parallel's. Returns success if any hoisting happened; folded` is /// set to true if the op was folded or erased. This hoisting could lead to diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp deleted file mode 100644 index b3651e2022458..0000000000000 --- a/mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp +++ /dev/null @@ -1,50 +0,0 @@ -//===- AffineParallelize.cpp - Affineparallelize Pass---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a parallelizer for affine loop nests that is able to -// perform inner or outer loop parallelization. -// -//===----------------------------------------------------------------------===// - -#include "PassDetail.h" -#include "mlir/Analysis/AffineStructures.h" -#include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Affine/IR/AffineValueMap.h" -#include "mlir/Dialect/Affine/Passes.h" -#include "mlir/Dialect/Affine/Passes.h.inc" -#include "mlir/Dialect/Affine/Utils.h" -#include "mlir/Transforms/LoopUtils.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "affine-parallel" - -using namespace mlir; - -namespace { -/// Convert all parallel affine.for op into 1-D affine.parallel op. 
-struct AffineParallelize : public AffineParallelizeBase { - void runOnFunction() override; -}; -} // namespace - -void AffineParallelize::runOnFunction() { - FuncOp f = getFunction(); - SmallVector parallelizableLoops; - f.walk([&](AffineForOp loop) { - if (isLoopParallel(loop)) - parallelizableLoops.push_back(loop); - }); - for (AffineForOp loop : parallelizableLoops) - affineParallelize(loop); -} - -std::unique_ptr> mlir::createAffineParallelizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt index 369874830c563..0098c3e210914 100644 --- a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -1,7 +1,6 @@ add_mlir_dialect_library(MLIRAffineTransforms AffineDataCopyGeneration.cpp AffineLoopInvariantCodeMotion.cpp - AffineParallelize.cpp LoopTiling.cpp LoopUnroll.cpp LoopUnrollAndJam.cpp diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index f1a9a0ce43b35..811579bb6c8c7 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -129,20 +129,6 @@ static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) { return hoistedIfOp; } -/// Replace affine.for with a 1-d affine.parallel by moving the former's body -/// into the latter one. -void mlir::affineParallelize(AffineForOp forOp) { - Location loc = forOp.getLoc(); - OpBuilder outsideBuilder(forOp); - // Create empty 1-D affine.parallel op. - AffineParallelOp newPloop = outsideBuilder.create( - loc, forOp.getLowerBoundMap(), forOp.getLowerBoundOperands(), - forOp.getUpperBoundMap(), forOp.getUpperBoundOperands()); - // Steal the body of the old affine for op and erase it. - newPloop.region().takeBody(forOp.region()); - forOp.erase(); -} - // Returns success if any hoisting happened. 
LogicalResult mlir::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) { // Apply canonicalization patterns and folding - this is necessary for the diff --git a/mlir/test/Dialect/Affine/parallelism-detection.mlir b/mlir/test/Dialect/Affine/parallelism-detection.mlir new file mode 100644 index 0000000000000..0788e6f8fb208 --- /dev/null +++ b/mlir/test/Dialect/Affine/parallelism-detection.mlir @@ -0,0 +1,47 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s -test-detect-parallel -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: func @loop_nest_3d_outer_two_parallel +func @loop_nest_3d_outer_two_parallel(%N : index) { + %0 = alloc() : memref<1024 x 1024 x vector<64xf32>> + %1 = alloc() : memref<1024 x 1024 x vector<64xf32>> + %2 = alloc() : memref<1024 x 1024 x vector<64xf32>> + affine.for %i = 0 to %N { + // expected-remark@-1 {{parallel loop}} + affine.for %j = 0 to %N { + // expected-remark@-1 {{parallel loop}} + affine.for %k = 0 to %N { + // expected-remark@-1 {{sequential loop}} + %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>> + %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>> + %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>> + %8 = mulf %5, %6 : vector<64xf32> + %9 = addf %7, %8 : vector<64xf32> + affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>> + } + } + } + return +} + +// ----- + +// CHECK-LABEL: unknown_op_conservative +func @unknown_op_conservative() { + affine.for %i = 0 to 10 { + // expected-remark@-1 {{sequential loop}} + "unknown"() : () -> () + } + return +} + +// ----- + +// CHECK-LABEL: non_affine_load +func @non_affine_load() { + %0 = alloc() : memref<100 x f32> + affine.for %i = 0 to 100 { + // expected-remark@-1 {{sequential loop}} + load %0[%i] : memref<100 x f32> + } + return +} diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir deleted file mode 100644 index 5287628185c5b..0000000000000 --- 
a/mlir/test/Dialect/Affine/parallelize.mlir +++ /dev/null @@ -1,118 +0,0 @@ -// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize| FileCheck %s - -// For multiple nested for-loops. -// CHECK-DAG: [[MAP5:#map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0 + d1, d2 * 2 + d3, d4 * 2 + d5, d6 + d7)> -// CHECK-LABEL: func @reduce_window_max() { -func @reduce_window_max() { - %cst = constant 0.000000e+00 : f32 - %0 = alloc() : memref<1x8x8x64xf32> - %1 = alloc() : memref<1x18x18x64xf32> - affine.for %arg0 = 0 to 1 { - affine.for %arg1 = 0 to 8 { - affine.for %arg2 = 0 to 8 { - affine.for %arg3 = 0 to 64 { - affine.store %cst, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> - } - } - } - } - affine.for %arg0 = 0 to 1 { - affine.for %arg1 = 0 to 8 { - affine.for %arg2 = 0 to 8 { - affine.for %arg3 = 0 to 64 { - affine.for %arg4 = 0 to 1 { - affine.for %arg5 = 0 to 3 { - affine.for %arg6 = 0 to 3 { - affine.for %arg7 = 0 to 1 { - %2 = affine.load %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> - %3 = affine.load %1[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32> - %4 = cmpf "ogt", %2, %3 : f32 - %5 = select %4, %2, %3 : f32 - affine.store %5, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32> - } - } - } - } - } - } - } - } - return -} - -// CHECK: %[[cst:.*]] = constant 0.000000e+00 : f32 -// CHECK: %[[v0:.*]] = alloc() : memref<1x8x8x64xf32> -// CHECK: %[[v1:.*]] = alloc() : memref<1x18x18x64xf32> -// CHECK: affine.parallel (%[[arg0:.*]]) = (0) to (1) { -// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (8) { -// CHECK: affine.parallel (%[[arg2:.*]]) = (0) to (8) { -// CHECK: affine.parallel (%[[arg3:.*]]) = (0) to (64) { -// CHECK: affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32> -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: affine.parallel (%[[a0:.*]]) = (0) to (1) { -// CHECK: affine.parallel (%[[a1:.*]]) = (0) to 
(8) { -// CHECK: affine.parallel (%[[a2:.*]]) = (0) to (8) { -// CHECK: affine.parallel (%[[a3:.*]]) = (0) to (64) { -// CHECK: affine.parallel (%[[a4:.*]]) = (0) to (1) { -// CHECK: affine.for %[[a5:.*]] = 0 to 3 { -// CHECK: affine.for %[[a6:.*]] = 0 to 3 { -// CHECK: affine.parallel (%[[a7:.*]]) = (0) to (1) { -// CHECK: %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32> -// CHECK: %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32> -// CHECK: %[[res:.*]] = cmpf "ogt", %[[lhs]], %[[rhs]] : f32 -// CHECK: %[[sel:.*]] = select %[[res]], %[[lhs]], %[[rhs]] : f32 -// CHECK: affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32> -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } - -func @loop_nest_3d_outer_two_parallel(%N : index) { - %0 = alloc() : memref<1024 x 1024 x vector<64xf32>> - %1 = alloc() : memref<1024 x 1024 x vector<64xf32>> - %2 = alloc() : memref<1024 x 1024 x vector<64xf32>> - affine.for %i = 0 to %N { - affine.for %j = 0 to %N { - %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>> - affine.for %k = 0 to %N { - %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>> - %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>> - %8 = mulf %5, %6 : vector<64xf32> - %9 = addf %7, %8 : vector<64xf32> - affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>> - } - } - } - return -} - -// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (symbol(%arg0)) { -// CHECK-NEXT: affine.parallel (%[[arg2:.*]]) = (0) to (symbol(%arg0)) { -// CHECK: affine.for %[[arg3:.*]] = 0 to %arg0 { - -// CHECK-LABEL: unknown_op_conservative -func @unknown_op_conservative() { - affine.for %i = 0 to 10 { -// CHECK: affine.for %[[arg1:.*]] = 0 to 10 { - "unknown"() : () -> () - } - return -} - -// CHECK-LABEL: 
non_affine_load -func @non_affine_load() { - %0 = alloc() : memref<100 x f32> - affine.for %i = 0 to 100 { -// CHECK: affine.for %{{.*}} = 0 to 100 { - load %0[%i] : memref<100 x f32> - } - return -} diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt index 3d08fed788e2e..68a0b06e0e318 100644 --- a/mlir/test/lib/Dialect/Affine/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_library(MLIRAffineTransformsTestPasses TestAffineDataCopy.cpp TestAffineLoopUnswitching.cpp TestLoopPermutation.cpp + TestParallelismDetection.cpp TestVectorizationUtils.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp new file mode 100644 index 0000000000000..b19e260316939 --- /dev/null +++ b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp @@ -0,0 +1,47 @@ +//===- TestParallelismDetection.cpp - Parallelism detection test pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a test pass that detects parallel 'affine.for' ops. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Utils.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { + +struct TestParallelismDetection + : public PassWrapper<TestParallelismDetection, FunctionPass> { + void runOnFunction() override; +}; + +} // end anonymous namespace + +// Walks the function and emits a remark on every 'affine.for' op reporting +// whether it was detected as parallel or sequential. 
+void TestParallelismDetection::runOnFunction() { + FuncOp f = getFunction(); + OpBuilder b(f.getBody()); + f.walk([&](AffineForOp forOp) { + if (isLoopParallel(forOp)) + forOp.emitRemark("parallel loop"); + else + forOp.emitRemark("sequential loop"); + }); +} + +namespace mlir { +void registerTestParallelismDetection() { + PassRegistration<TestParallelismDetection> pass( + "test-detect-parallel", "Test parallelism detection "); +} +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index a6da313f155b3..2d753d8fd076e 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -62,6 +62,7 @@ void registerTestMatchers(); void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); void registerTestOpaqueLoc(); +void registerTestParallelismDetection(); void registerTestPreparationPassWithAllowedMemrefResults(); void registerTestGpuParallelLoopMappingPass(); void registerTestSCFUtilsPass(); @@ -136,6 +137,7 @@ void registerTestPasses() { registerTestMemRefDependenceCheck(); registerTestMemRefStrideCalculation(); registerTestOpaqueLoc(); + registerTestParallelismDetection(); registerTestPreparationPassWithAllowedMemrefResults(); registerTestGpuParallelLoopMappingPass(); registerTestSCFUtilsPass();